def test_list_of_rows_table():
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    # need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    df = table.asDataFrame()
    assert list(df['Name']) == [r[0] for r in data]
def test_list_of_rows_table():
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    # need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert_equals(table_row, expected_row)

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert_equals(rowset_row['values'], expected_row)

    table.columns = cols

    df = table.asDataFrame()
    assert_equals(list(df['Name']), [r[0] for r in data])
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267
    cols = [Column(name='x', columnType='STRING'),
            Column(name='y', columnType='INTEGER'),
            Column(name='z', columnType='DOUBLE')]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n')
        temp.flush()
        filename = temp.name

    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    # compare to make sure no .0's were appended to the integers
    assert filecmp.cmp(table.filepath, table_from_dataframe.filepath)
def test_pandas_to_table():
    pd = _try_import_pandas('test_pandas_to_table')

    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    print("\n", df, "\n\n")

    ## A dataframe with no row id and version
    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == (i + 1)
        assert row[1] == ["c", "d", "e"][i]

    assert len(table) == 3

    ## If includeRowIdAndRowVersion=True, include empty row ids and versions
    ## ROW_ID,ROW_VERSION,a,b
    ## ,,1,c
    ## ,,2,d
    ## ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        print(row)
        assert row[0] is None
        assert row[1] is None
        assert row[2] == (i + 1)

    ## A dataframe with row id and version encoded in its index
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"], data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", df, "\n\n")

    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["1", "2", "3"][i]
        assert row[1] == ["7", "7", "8"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == ["c", "d", "e"][i]

    ## A dataframe with row id and version in columns
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"], ROW_VERSION=["8", "9", "9"],
                           a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", df, "\n\n")

    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["0", "1", "2"][i]
        assert row[1] == ["8", "9", "9"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == ["c", "d", "e"][i]
def test_pandas_to_table():
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    # A dataframe with no row id and version
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], (i + 1))
        assert_equals(row[1], ["c", "d", "e"][i])

    assert_equals(len(table), 3)

    # If includeRowIdAndRowVersion=True, include empty row ids and versions
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], (i + 1))

    # A dataframe with row id and version encoded in its index
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"], data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["1", "2", "3"][i])
        assert_equals(row[1], ["7", "7", "8"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])

    # A dataframe with row id and version in columns
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"], ROW_VERSION=["8", "9", "9"],
                           a=[100, 200, 300], b=["c", "d", "e"]))
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["0", "1", "2"][i])
        assert_equals(row[1], ["8", "9", "9"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])
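# Aside on the "1_7"-style index used in the two tests above: synapseclient
# encodes row identity in the DataFrame index as "<rowId>_<rowVersion>", which
# is why the tests expect row[0] == "1" and row[1] == "7" for index label
# "1_7". A minimal sketch of the convention (the helper name is illustrative,
# not part of the client API):
def split_row_label(label):
    row_id, row_version = label.split("_")
    return row_id, row_version

assert split_row_label("1_7") == ("1", "7")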
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
            {'columnType': 'STRING', 'id': '353', 'name': 'name'},
            {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
            {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
            {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [
            {'rowId': 5, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 3},
            {'rowId': 6, 'values': ['bar', '1.34', '2.4', '101'], 'versionNumber': 3},
            {'rowId': 7, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 4},
            {'rowId': 8, 'values': ['qux', '1.23', '2.2', '102'], 'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    try:
        import pandas as pd
        df = table.asDataFrame()
        assert df.shape == (4, 4)
        assert all(df['name'] == ['foo', 'bar', 'foo', 'qux'])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_RowSetTable.\n\n')
def test_tables_csv():
    # Define schema
    cols = [Column(name='Name', columnType='STRING'),
            Column(name='Born', columnType='INTEGER'),
            Column(name='Hipness', columnType='DOUBLE'),
            Column(name='Living', columnType='BOOLEAN')]
    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    # the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    # Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    # Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert_equals(expected_row, row, "expected %s but got %s" % (expected_row, row))
def test_tables_pandas():
    try:
        ## check if we have pandas
        import pandas as pd

        ## create a pandas DataFrame
        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False)})

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        ## store in Synapse
        table = syn.store(Table(schema, df))

        ## retrieve the table and verify
        results = syn.tableQuery('select * from %s' % table.schema.id)
        df2 = results.asDataFrame()

        ## simulate rowId-version rownames for comparison
        df.index = ['%s_0' % i for i in range(5)]

        # (df2 == df) is a DataFrame of booleans; reduce it per column and then
        # overall -- a bare all(df2 == df) would only iterate the column labels
        assert (df2 == df).all().all()
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping test_tables_pandas.\n\n')
def test_build_table_download_file_handle_list__repeated_file_handles():
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    # patch the cache so we don't look there in case FileHandle ids actually exist there
    with patch.object(syn.cache, "get", return_value=None):
        cols = [Column(name='Name', columnType='STRING', maximumSize=50),
                Column(name='filehandle', columnType='FILEHANDLEID')]

        schema = Schema(name='FileHandleTest', columns=cols, parent='syn420')

        # using some large filehandle numbers so we don't accidentally collide with real ones
        data = [["ayy lmao", 5318008],
                ["large numberino", 0x5f3759df],
                ["repeated file handle", 5318008],
                ["repeated file handle also", 0x5f3759df]]

        ## need columns to do cast_values w/o storing
        table = Table(schema, data, headers=[SelectColumn.from_column(col) for col in cols])

        file_handle_associations, file_handle_to_path_map = syn._build_table_download_file_handle_list(
            table, ['filehandle'])

        # verify only 2 file_handles are added (repeats were ignored)
        assert_equals(2, len(file_handle_associations))
        assert_equals(0, len(file_handle_to_path_map))  # might as well check anyways
def test_list_of_rows_table():
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    ## need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    ## test asDataFrame
    try:
        import pandas as pd
        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in data])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion'
                         ' of test_list_of_rows_table.\n\n')
def test_table_file_view_csv_update_annotations__includeEntityEtag():
    folder = syn.store(synapseclient.Folder(name="updateAnnoFolder" + str(uuid.uuid4()), parent=project))

    anno1_name = "annotationColumn1"
    anno2_name = "annotationColumn2"
    initial_annotations = {anno1_name: "initial_value1",
                           anno2_name: "initial_value2"}
    file_entity = syn.store(File(name="test_table_file_view_csv_update_annotations__includeEntityEtag",
                                 path="~/fakepath", synapseStore=False, parent=folder,
                                 annotations=initial_annotations))

    annotation_columns = [Column(name=anno1_name, columnType='STRING'),
                          Column(name=anno2_name, columnType='STRING')]
    entity_view = syn.store(EntityViewSchema(name="TestEntityViewSchemaUpdateAnnotation" + str(uuid.uuid4()),
                                             parent=project, scopes=[folder],
                                             columns=annotation_columns))

    query_str = "SELECT {anno1}, {anno2} FROM {proj_id}".format(anno1=anno1_name, anno2=anno2_name,
                                                                proj_id=utils.id_of(entity_view))

    # modify first annotation using rowset
    rowset_query_result = syn.tableQuery(query_str, resultsAs="rowset")
    rowset = rowset_query_result.asRowSet()
    rowset_changed_anno_value = "rowset_value_change"
    rowset.rows[0].values[0] = rowset_changed_anno_value
    syn.store(rowset)

    # modify second annotation using csv
    csv_query_result = syn.tableQuery(query_str, resultsAs="csv")
    dataframe = csv_query_result.asDataFrame()
    csv_changed_anno_value = "csv_value_change"
    # (.ix has been removed from pandas; .loc on the first index label is equivalent here)
    dataframe.loc[dataframe.index[0], anno2_name] = csv_changed_anno_value
    syn.store(Table(utils.id_of(entity_view), dataframe))

    # check annotations in the file entity. Annotations may not be immediately updated, so we wait in a loop
    expected_annotations = {anno1_name: [rowset_changed_anno_value],
                            anno2_name: [csv_changed_anno_value]}
    start_time = time.time()
    while expected_annotations != file_entity.annotations:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)
        file_entity = syn.get(file_entity, downloadFile=False)
def test_tables_pandas():
    try:
        ## check if we have pandas
        import pandas as pd
        # import numpy for datatypes
        import numpy as np

        ## create a pandas DataFrame
        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(0.42 * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
            # additional data types supported since SYNPY-347
            'int64': tuple(np.int64(range(5))),
            'datetime64': tuple(np.datetime64(d) for d in ['2005-02-01', '2005-02-02', '2005-02-03',
                                                           '2005-02-04', '2005-02-05']),
            'string_': tuple(np.string_(s) for s in ['urgot', 'has', 'dark', 'mysterious', 'past'])})

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        ## store in Synapse
        table = syn.store(Table(schema, df))

        ## retrieve the table and verify
        results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
        df2 = results.asDataFrame(convert_to_datetime=True)

        ## simulate rowId-version rownames for comparison
        df.index = ['%s_0' % i for i in range(5)]

        # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
        if six.PY3:
            df['string_'] = df['string_'].transform(str)

        # df2 == df gives a DataFrame of booleans; the first .all() gives a Series of ANDed
        # booleans per column; the second .all() ANDs that Series down to a single bool
        assert (df2 == df).all().all()
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping test_tables_pandas.\n\n')
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.io_data import read_synapse_table_files, write_synapse_table
    >>> in_synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> column_names = []
    >>> download_limit = None
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_synapse_table_files(in_synapse_table_id, column_names, download_limit, out_path, username, password)
    >>> table_name = 'Contents of ' + in_synapse_table_id
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
            {'columnType': 'STRING', 'id': '353', 'name': 'name'},
            {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
            {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
            {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [
            {'rowId': 5, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 3},
            {'rowId': 6, 'values': ['bar', '1.34', '2.4', '101'], 'versionNumber': 3},
            {'rowId': 7, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 4},
            {'rowId': 8, 'values': ['qux', '1.23', '2.2', '102'], 'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert_equals(row_set.etag, 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee')
    assert_equals(row_set.tableId, 'syn2976298')
    assert_equals(len(row_set.headers), 4)
    assert_equals(len(row_set.rows), 4)

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert_equals(table.etag, 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee')
    assert_equals(table.tableId, 'syn2976298')
    assert_equals(len(table.headers), 4)
    assert_equals(len(table.asRowSet().rows), 4)

    df = table.asDataFrame()
    assert_equals(df.shape, (4, 4))
    assert_equals(list(df['name']), ['foo', 'bar', 'foo', 'qux'])
def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(Schema("testTable", [Column(name="testerino", columnType='DATE')], project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    rowset_table = syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema), resultsAs="rowset")
    assert_equals(current_datetime, query_result.rowset['rows'][0]['values'][0])
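# Why round(time.time(), 3) above: Synapse DATE columns store epoch
# milliseconds, so sub-millisecond precision would be lost on the round trip
# and the equality check would fail. A minimal, Synapse-free sketch of the
# same truncation:
import time
from datetime import datetime

now = datetime.fromtimestamp(round(time.time(), 3))
millis = round(now.timestamp() * 1000)   # roughly what a DATE column stores
assert datetime.fromtimestamp(millis / 1000.0) == now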
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267
    cols = [Column(name='x', columnType='STRING'),
            Column(name='y', columnType='INTEGER'),
            Column(name='z', columnType='DOUBLE')]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    line_terminator = str(os.linesep)
    # write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator
                   + 'a,1,0.9' + line_terminator
                   + 'b,,0.8' + line_terminator
                   + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name

    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    df2 = table_from_dataframe.asDataFrame()
    assert_frame_equal(df, df2)
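# Background for the two SYNPY-267 round-trip tests above: pandas cannot hold
# a missing value in an int64 column, so reading a CSV with a blank integer
# cell silently promotes the whole column to float64 -- and writing that back
# out naively would emit "1.0" where the source had "1". A minimal,
# Synapse-free sketch of the promotion:
import pandas as pd
from io import StringIO

df = pd.read_csv(StringIO("x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n"))
assert df['y'].dtype == 'float64'  # promoted because of the missing value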
def test_dict_to_table():
    d = dict(a=[1, 2, 3], b=["c", "d", "e"])
    df = pd.DataFrame(d)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, d)

    # call_args is a tuple of (positional args, keyword args)
    args_list = mocked_from_data_frame.call_args[0]
    # the second positional argument is the DataFrame built from the dict
    df_arg = args_list[1]
    assert df_arg.equals(df)
def test_list_of_rows_table():
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002", parent="syn1000001")

    ## need columns to do cast_values w/o storing
    table = Table(schema1, data, headers=[SelectColumn.from_column(col) for col in cols])

    for table_row, expected_row in izip(table, data):
        assert table_row == expected_row

    rowset = table.asRowSet()
    for rowset_row, expected_row in izip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    ## test asDataFrame
    try:
        import pandas as pd
        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in data])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion'
                         ' of test_list_of_rows_table.\n\n')
def test_tables_pandas():
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in ['2005-02-01', '2005-02-02', '2005-02-03',
                                                       '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in ['urgot', 'has', 'dark', 'mysterious', 'past'])})

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_0' % i for i in range(5)]

    # for python3 we need to convert from numpy.bytes_ to str or the equivalence comparison fails
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717: localize expected datetimes to UTC to match the query result
    df['datetime64'] = df['datetime64'].apply(lambda x: pd.Timestamp(x).tz_localize('UTC'))

    assert_frame_equal(df2, df)
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
            {'columnType': 'STRING', 'id': '353', 'name': 'name'},
            {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
            {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
            {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [
            {'rowId': 5, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 3},
            {'rowId': 6, 'values': ['bar', '1.34', '2.4', '101'], 'versionNumber': 3},
            {'rowId': 7, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 4},
            {'rowId': 8, 'values': ['qux', '1.23', '2.2', '102'], 'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    df = table.asDataFrame()
    assert df.shape == (4, 4)
    assert list(df['name']) == ['foo', 'bar', 'foo', 'qux']
def test_tables_csv():
    ## Define schema
    cols = []
    cols.append(Column(name='Name', columnType='STRING'))
    cols.append(Column(name='Born', columnType='INTEGER'))
    cols.append(Column(name='Hipness', columnType='DOUBLE'))
    cols.append(Column(name='Living', columnType='BOOLEAN'))

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row, row)

    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert df.iloc[1, 2] - 9.87 < 0.0001
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping test of .asDataFrame'
                         ' for CSV tables.\n\n')

    ## Aggregate query
    expected = {True: [True, 1929, 3, 6.38],
                False: [False, 1926, 5, 7.104]}

    results = syn.tableQuery('select Living, min(Born), count(Living), avg(Hipness) from %s group by Living'
                             % table.schema.id, resultsAs="csv", includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        assert abs(expected[living][3] - row[3]) < 0.0001

    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping test of .asDataFrame'
                         ' for aggregate queries as CSV tables.\n\n')

    ## Append rows
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938, float('nan'), False],
                      ["Thelonious Monk", 1917, float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))

    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field

    ## Update as a RowSet
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    row_reference_set = syn.store(rowset)

    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery('select Born, COUNT(*) from %s group by Born order by Born' % table.schema.id,
                             resultsAs="csv")
    assert results.includeRowIdAndRowVersion == False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]

    try:
        import pandas as pd

        results = syn.tableQuery("select * from %s where Born=1930" % table.schema.id, resultsAs="csv")
        df = results.asDataFrame()
        # (the original left off the asserts on these two checks)
        assert all(df['Born'].values == 1930)
        assert all(df['Hipness'].values == 8.5)

        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))

        results = syn.tableQuery("select * from %s where Born=1930" % table.tableId, resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_tables_csv.\n\n')

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' % table.tableId, resultsAs="csv")
    assert len(list(results)) == 0

    try:
        import pandas as pd

        results = syn.tableQuery('select * from %s where Born=2013' % table.tableId, resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_tables_csv.\n\n')

    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' % table.tableId, resultsAs="csv")
    syn.delete(results)
def test_csv_table():
    # Maybe not truly a unit test, but here because it doesn't do
    # network IO to synapse
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]

    filename = None

    cols = [Column(id='1', name='Name', columnType='STRING'),
            Column(id='2', name='Born', columnType='INTEGER'),
            Column(id='3', name='Hipness', columnType='DOUBLE'),
            Column(id='4', name='Living', columnType='BOOLEAN')]

    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")

    # TODO: use StringIO.StringIO(data) rather than writing files
    try:
        # create CSV file
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name

        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert_is_instance(table, CsvFileTable)

        # need to set column headers to read a CSV file
        table.setColumnHeaders(
            [SelectColumn(name="ROW_ID", columnType="STRING"),
             SelectColumn(name="ROW_VERSION", columnType="STRING")]
            + [SelectColumn.from_column(col) for col in cols])

        # test iterator
        for table_row, expected_row in zip(table, data):
            assert_equals(table_row, expected_row)

        # test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert_equals(rowset_row['values'], expected_row[2:])
            assert_equals(rowset_row['rowId'], expected_row[0])
            assert_equals(rowset_row['versionNumber'], expected_row[1])

        df = table.asDataFrame()
        assert_equals(list(df['Name']), [row[2] for row in data])
        assert_equals(list(df['Born']), [row[3] for row in data])
        assert_equals(list(df['Living']), [row[5] for row in data])
        assert_equals(list(df.index), ['%s_%s' % tuple(row[0:2]) for row in data])
        assert_equals(df.shape, (8, 4))
    except Exception:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
def test_RowSetTable():
    row_set_json = {
        'etag': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
        'headers': [
            {'columnType': 'STRING', 'id': '353', 'name': 'name'},
            {'columnType': 'DOUBLE', 'id': '355', 'name': 'x'},
            {'columnType': 'DOUBLE', 'id': '3020', 'name': 'y'},
            {'columnType': 'INTEGER', 'id': '891', 'name': 'n'}],
        'rows': [
            {'rowId': 5, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 3},
            {'rowId': 6, 'values': ['bar', '1.34', '2.4', '101'], 'versionNumber': 3},
            {'rowId': 7, 'values': ['foo', '1.23', '2.2', '101'], 'versionNumber': 4},
            {'rowId': 8, 'values': ['qux', '1.23', '2.2', '102'], 'versionNumber': 3}],
        'tableId': 'syn2976298'}

    row_set = RowSet.from_json(row_set_json)

    assert row_set.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert row_set.tableId == 'syn2976298'
    assert len(row_set.headers) == 4
    assert len(row_set.rows) == 4

    schema = Schema(id="syn2976298", name="Bogus Schema", columns=[353, 355, 3020, 891], parent="syn1000001")

    table = Table(schema, row_set)

    assert table.etag == 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'
    assert table.tableId == 'syn2976298'
    assert len(table.headers) == 4
    assert len(table.asRowSet().rows) == 4

    try:
        import pandas as pd
        df = table.asDataFrame()
        assert df.shape == (4, 4)
        assert all(df['name'] == ['foo', 'bar', 'foo', 'qux'])
    except ImportError:
        sys.stderr.write('Pandas is apparently not installed, skipping part of test_RowSetTable.\n\n')
def concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name,
                                        username='', password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrame, increasing the number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        DataFrames to concatenate and store
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                    index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Concatenate dataframes: reuse the indices from the original DataFrame,
    # increasing number of columns:
    table_data = pd.concat(frames, axis=1)  # , join_axes=[frames[0].index])

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))

    return table_data, synapse_project_id
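# Minimal illustration of the axis=1 concatenation used above (plain pandas,
# no Synapse calls): rows are aligned on the shared index and the table widens.
import pandas as pd

df1 = pd.DataFrame({'A': ['A0', 'A1']}, index=[0, 1])
df2 = pd.DataFrame({'B': ['B0', 'B1']}, index=[0, 1])
wide = pd.concat([df1, df2], axis=1)
assert list(wide.columns) == ['A', 'B']
assert wide.shape == (2, 2)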
def copy_synapse_table(synapse_table_id, synapse_project_id, table_name='',
                       remove_columns=[], username='', password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns:
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table:
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
def test_csv_table():
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]

    filename = None

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")

    # TODO: use StringIO.StringIO(data) rather than writing files
    try:
        ## create CSV file
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name

        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)

        ## need to set column headers to read a CSV file
        table.setColumnHeaders(
            [SelectColumn(name="ROW_ID", columnType="STRING"),
             SelectColumn(name="ROW_VERSION", columnType="STRING")]
            + [SelectColumn.from_column(col) for col in cols])

        ## test iterator
        for table_row, expected_row in zip(table, data):
            assert table_row == expected_row

        ## test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]

        ## test asDataFrame
        try:
            import pandas as pd
            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)
        except ImportError:
            sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion'
                             ' of test_csv_table.\n\n')
    except Exception as ex1:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
def feature_file_to_synapse_table(feature_file, raw_feature_file, source_file_id,
                                  provenance_activity_id, command, command_line,
                                  synapse_table_id, username='', password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    feature_file : string
        path to file to upload to Synapse
    raw_feature_file : string
        path to file to upload to Synapse
    source_file_id : string
        Synapse file handle ID to source file used to generate features
    provenance_activity_id : string
        Synapse provenance activity ID
    command : string
        name of command run to generate raw feature file
    command_line : string
        full command line run to generate raw feature file
    synapse_table_id : string
        Synapse table ID for table to store file handle IDs, etc.
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.xtra import feature_file_to_synapse_table
    >>> feature_file = '/Users/arno/Local/wav/test1.wav'
    >>> raw_feature_file = '/Users/arno/Local/wav/test1.wav'
    >>> source_file_id = ''
    >>> provenance_activity_id = ''
    >>> command = 'SMILExtract'
    >>> command_line = 'SMILExtract -C blah -I blah -O blah'
    >>> synapse_table_id = 'syn4899451'
    >>> username = ''
    >>> password = ''
    >>> feature_file_to_synapse_table(feature_file, raw_feature_file, source_file_id, provenance_activity_id, command, command_line, synapse_table_id, username, password)

    """
    import synapseclient
    from synapseclient.table import Table

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store feature and raw feature files and get file handle IDs:
    file_handle = syn._chunkedUploadFile(feature_file)
    file_id = file_handle['id']
    raw_file_handle = syn._chunkedUploadFile(raw_feature_file)
    raw_file_id = raw_file_handle['id']

    # Add new row to Synapse table:
    new_rows = [[file_id, raw_file_id, source_file_id,
                 provenance_activity_id, command, command_line]]
    schema = syn.get(synapse_table_id)
    table = syn.store(Table(schema, new_rows))

    return synapse_table_id
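# Note: _chunkedUploadFile, used above, is a private helper from older
# synapseclient releases and has since been removed. A hedged sketch of the
# equivalent step on a modern client; uploadFileHandle is the public
# replacement, but verify its exact signature against your installed
# synapseclient version before relying on this:
def _upload_file_handle_modern(syn, path, parent_id):
    file_handle = syn.uploadFileHandle(path, parent=parent_id)
    return file_handle['id']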
def test_csv_table():
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]

    filename = None

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")

    # TODO: use StringIO.StringIO(data) rather than writing files
    try:
        ## create CSV file
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=os.linesep)
            writer.writerow(['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols])
            filename = temp.name
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)

        ## need to set column headers to read a CSV file
        table.setColumnHeaders(
            [SelectColumn(name="ROW_ID", columnType="STRING"),
             SelectColumn(name="ROW_VERSION", columnType="STRING")]
            + [SelectColumn.from_column(col) for col in cols])

        ## test iterator
        for table_row, expected_row in izip(table, data):
            assert table_row == expected_row

        ## test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in izip(rowset.rows, data):
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]

        ## test asDataFrame
        try:
            import pandas as pd
            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)
        except ImportError:
            sys.stderr.write('Pandas is apparently not installed, skipping asDataFrame portion'
                             ' of test_csv_table.\n\n')
    except Exception as ex1:
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise