def test_as_table_columns__with_pandas_DataFrame():
    """as_table_columns should infer one Synapse ColumnModel per DataFrame column."""
    frame = pd.DataFrame(
        {
            'foobar': ("foo", "bar", "baz", "qux", "asdf"),
            'x': tuple(math.pi * i for i in range(5)),
            'n': (101, 202, 303, 404, 505),
            'really': (False, True, False, True, False),
            'size': ('small', 'large', 'medium', 'medium', 'large'),
        },
        columns=['foobar', 'x', 'n', 'really', 'size'])

    actual = as_table_columns(frame)

    concrete_type = 'org.sagebionetworks.repo.model.table.ColumnModel'
    expected_columns = [
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'foobar',
         'maximumSize': 30, 'concreteType': concrete_type},
        {'columnType': 'DOUBLE', 'name': 'x', 'concreteType': concrete_type},
        {'columnType': 'INTEGER', 'name': 'n', 'concreteType': concrete_type},
        {'columnType': 'BOOLEAN', 'name': 'really', 'concreteType': concrete_type},
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'size',
         'maximumSize': 30, 'concreteType': concrete_type},
    ]
    assert expected_columns == actual
def test_as_table_columns__with_pandas_DataFrame():
    """Column models inferred from a DataFrame should match the expected dicts."""
    concrete_type = 'org.sagebionetworks.repo.model.table.ColumnModel'
    df = pd.DataFrame(
        {'foobar': ("foo", "bar", "baz", "qux", "asdf"),
         'x': tuple(math.pi * i for i in range(5)),
         'n': (101, 202, 303, 404, 505),
         'really': (False, True, False, True, False),
         'size': ('small', 'large', 'medium', 'medium', 'large')},
        columns=['foobar', 'x', 'n', 'really', 'size'])

    expected_columns = [
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'foobar',
         'maximumSize': 30, 'concreteType': concrete_type},
        {'columnType': 'DOUBLE', 'name': 'x', 'concreteType': concrete_type},
        {'columnType': 'INTEGER', 'name': 'n', 'concreteType': concrete_type},
        {'columnType': 'BOOLEAN', 'name': 'really', 'concreteType': concrete_type},
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'size',
         'maximumSize': 30, 'concreteType': concrete_type},
    ]
    assert_equals(expected_columns, as_table_columns(df))
def test_tables_pandas():
    try:
        # skip the whole test when pandas is unavailable
        import pandas as pd

        # a small DataFrame with string, float, int and boolean columns
        frame = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False)})

        columns = as_table_columns(frame)
        columns[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=columns, parent=project)

        # round-trip the table through Synapse
        stored = syn.store(Table(schema, frame))
        results = syn.tableQuery('select * from %s' % stored.schema.id)
        frame2 = results.asDataFrame()

        # fake the rowId-version index that Synapse attaches to query results
        frame.index = ['%s_0' % i for i in range(5)]
        assert all(frame2 == frame)
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_tables_pandas.\n\n')
def test_as_table_columns():
    """Verify as_table_columns infers the right Synapse column type for each DataFrame column.

    BUG FIX: the original body used bare `==` comparisons (no `assert`), so the
    test could never fail, and it compared `cols[1]` for every column after the
    first instead of cols[2]/cols[3]/cols[4]. The checks below actually assert,
    and are keyed by column name so they do not depend on column ordering.
    """
    try:
        import pandas as pd

        df = pd.DataFrame({
            'foobar': ("foo", "bar", "baz", "qux", "asdf"),
            'x': tuple(math.pi * i for i in range(5)),
            'n': (101, 202, 303, 404, 505),
            'really': (False, True, False, True, False),
            'size': ('small', 'large', 'medium', 'medium', 'large')
        })
        cols = as_table_columns(df)

        # map column name -> inferred type, so assertions are order-independent
        col_types = {col['name']: col['columnType'] for col in cols}
        assert col_types['foobar'] == 'STRING'
        assert col_types['x'] == 'DOUBLE'
        assert col_types['n'] == 'INTEGER'
        assert col_types['really'] == 'BOOLEAN'
        # TODO: support Categorical when fully supported in Pandas Data Frames
        assert col_types['size'] == 'STRING'
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_as_table_columns.\n\n'
        )
def test_as_table_columns():
    """Check the Synapse column types inferred from a pandas DataFrame.

    BUG FIX: the original comparisons were bare expressions without `assert`
    (they checked nothing), and `cols[1]` was reused for every column beyond
    the first. Assertions are now real and keyed by column name.
    """
    try:
        import pandas as pd

        df = pd.DataFrame({
            'foobar': ("foo", "bar", "baz", "qux", "asdf"),
            'x': tuple(math.pi * i for i in range(5)),
            'n': (101, 202, 303, 404, 505),
            'really': (False, True, False, True, False),
            'size': ('small', 'large', 'medium', 'medium', 'large')})
        cols = as_table_columns(df)

        expected_types = {
            'foobar': 'STRING',
            'x': 'DOUBLE',
            'n': 'INTEGER',
            'really': 'BOOLEAN',
            # TODO: support Categorical when fully supported in Pandas Data Frames
            'size': 'STRING',
        }
        for col in cols:
            assert col['columnType'] == expected_types[col['name']]
        assert len(cols) == len(expected_types)
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_as_table_columns.\n\n')
def test_tables_pandas():
    try:
        import pandas as pd  # test is a no-op when pandas is missing

        data = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False)
        })

        cols = as_table_columns(data)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        # store in Synapse, then query the same rows back
        table = syn.store(Table(schema, data))
        queried = syn.tableQuery('select * from %s' % table.schema.id)
        round_tripped = queried.asDataFrame()

        # mimic the "<rowId>_<version>" index of query results
        data.index = ['%s_0' % i for i in range(5)]
        assert all(round_tripped == data)
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
def test_pandas_to_table():
    """Exercise Table() over DataFrames with several row id/version encodings."""
    pd = _try_import_pandas('test_pandas_to_table')

    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))
    print("\n", df, "\n\n")

    # no row id / version present anywhere
    table = Table(schema, df)
    b_values = ["c", "d", "e"]
    for i, row in enumerate(table):
        print(row)
        assert row[0] == i + 1
        assert row[1] == b_values[i]
    assert len(table) == 3

    # includeRowIdAndRowVersion=True leaves the leading id/version cells empty:
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        print(row)
        assert row[0] is None
        assert row[1] is None
        assert row[2] == i + 1

    # row id/version parsed out of the "<id>_<version>" index
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", df, "\n\n")
    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["1", "2", "3"][i]
        assert row[1] == ["7", "7", "8"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == b_values[i]

    # row id/version supplied as explicit ROW_ID / ROW_VERSION columns
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"], ROW_VERSION=["8", "9", "9"],
                           a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", df, "\n\n")
    table = Table(schema, df)
    for i, row in enumerate(table):
        print(row)
        assert row[0] == ["0", "1", "2"][i]
        assert row[1] == ["8", "9", "9"][i]
        assert row[2] == (i + 1) * 100
        assert row[3] == b_values[i]
def test_pandas_to_table():
    try:
        import pandas as pd

        df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
        schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))
        print("\n", df, "\n\n")

        # dataframe carrying neither row ids nor versions
        table = Table(schema, df)
        for i, row in enumerate(table):
            print(row)
            assert row[0] == i + 1
            assert row[1] == ["c", "d", "e"][i]
        assert len(table) == 3

        # includeRowIdAndRowVersion=True emits empty id/version columns:
        # ROW_ID,ROW_VERSION,a,b
        # ,,1,c
        # ,,2,d
        # ,,3,e
        table = Table(schema, df, includeRowIdAndRowVersion=True)
        for i, row in enumerate(table):
            print(row)
            assert row[0] is None
            assert row[1] is None
            assert row[2] == i + 1

        # row id and version packed into the index as "<id>_<version>"
        df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                          data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
        print("\n", df, "\n\n")
        table = Table(schema, df)
        for i, row in enumerate(table):
            print(row)
            assert row[0] == ["1", "2", "3"][i]
            assert row[1] == ["7", "7", "8"][i]
            assert row[2] == (i + 1) * 100
            assert row[3] == ["c", "d", "e"][i]

        # row id and version as explicit columns
        df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"],
                               ROW_VERSION=["8", "9", "9"],
                               a=[100, 200, 300],
                               b=["c", "d", "e"]))
        print("\n", df, "\n\n")
        table = Table(schema, df)
        for i, row in enumerate(table):
            print(row)
            assert row[0] == ["0", "1", "2"][i]
            assert row[1] == ["8", "9", "9"][i]
            assert row[2] == (i + 1) * 100
            assert row[3] == ["c", "d", "e"][i]
    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_pandas_to_table.\n\n')
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.io_data import read_synapse_table_files, write_synapse_table
    >>> in_synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> column_names = []
    >>> download_limit = None
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_synapse_table_files(in_synapse_table_id, column_names, download_limit, out_path, username, password)
    >>> table_name = 'Contents of ' + in_synapse_table_id
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)
    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Credentials are optional once they have been cached on this machine:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Re-number rows so the stored table gets a clean 0..n-1 index:
    table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)
    syn.store(Table(schema, table_data))
def test_tables_pandas():
    try:
        import pandas as pd  # skip if pandas is unavailable
        # numpy supplies the extra dtypes exercised below
        import numpy as np

        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(0.42 * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
            # additional data types supported since SYNPY-347
            'int64': tuple(np.int64(range(5))),
            'datetime64': tuple(np.datetime64(d) for d in
                                ['2005-02-01', '2005-02-02', '2005-02-03',
                                 '2005-02-04', '2005-02-05']),
            'string_': tuple(np.string_(s) for s in
                             ['urgot', 'has', 'dark', 'mysterious', 'past'])
        })

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        # store in Synapse, then query it back as csv
        table = syn.store(Table(schema, df))
        results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
        df2 = results.asDataFrame(convert_to_datetime=True)

        # simulate the rowId-version row names for comparison
        df.index = ['%s_0' % i for i in range(5)]

        # on python3, numpy.bytes_ values must become str or equality fails
        if six.PY3:
            df['string_'] = df['string_'].transform(str)

        # (df2 == df) is a boolean frame; the first .all() ANDs each column,
        # the second .all() ANDs the resulting Series down to a single bool
        assert (df2 == df).all().all()
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
def test_iter_with_table_row_metadata(self):
    # The csv carries ROW_ID/ROW_VERSION but self.headers does not, so
    # iteration should yield only the data column values.
    data = "ROW_ID,ROW_VERSION,col\n" \
           "1,2,\"I like trains\"\n" \
           "5,1,\"weeeeeeeeeeee\"\n"
    headers = [SelectColumn.from_column(col)
               for col in as_table_columns(StringIOContextManager(data))]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        for want, got in zip([["I like trains"], ["weeeeeeeeeeee"]], csv_table):
            assert_equals(want, got)
def test_iter_no_row_metadata(self):
    # Neither the csv headers nor self.headers carry row metadata,
    # so rows come back exactly as written.
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    headers = [SelectColumn.from_column(col)
               for col in as_table_columns(StringIOContextManager(data))]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        for want, got in zip([[1, 2], [2, 1]], csv_table):
            assert_equals(want, got)
def test_iter_no_row_metadata(self):
    # Row metadata is absent from both the csv and self.headers;
    # iteration should reproduce the raw data rows.
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn.from_column(c) for c in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected = [[1, 2], [2, 1]]
        for want, got in zip(expected, csv_table):
            assert want == got
def test_iter_with_table_row_metadata(self):
    # csv has ROW_ID/ROW_VERSION columns while self.headers does not;
    # iteration should strip the metadata and yield data cells only.
    data = "ROW_ID,ROW_VERSION,col\n" \
           "1,2,\"I like trains\"\n" \
           "5,1,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn.from_column(c) for c in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected = [["I like trains"], ["weeeeeeeeeeee"]]
        for want, got in zip(expected, csv_table):
            assert want == got
def test_dict_to_table():
    # Table() should convert a plain dict into an equivalent DataFrame
    # before delegating to CsvFileTable.from_data_frame.
    payload = dict(a=[1, 2, 3], b=["c", "d", "e"])
    expected_df = pd.DataFrame(payload)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(expected_df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, payload)

    # the dataframe is the second positional argument of from_data_frame
    passed_df = mocked_from_data_frame.call_args[0][1]
    assert passed_df.equals(expected_df)
def test_iter_with_mismatch_row_metadata(self):
    # self.headers and the csv file headers disagree about row metadata
    # (csv has ROW_ETAG, self.headers does not), so iterating must raise.
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        # FIX: use a local name instead of shadowing the builtin iter()
        table_iter = iter(table)
        pytest.raises(ValueError, next, table_iter)
def test_dict_to_table():
    # A dict handed to Table() must arrive at CsvFileTable.from_data_frame
    # as an equivalent pandas DataFrame.
    source = dict(a=[1, 2, 3], b=["c", "d", "e"])
    reference_df = pd.DataFrame(source)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(reference_df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, source)

    # positional args of the recorded call; index 1 is the dataframe
    positional = mocked_from_data_frame.call_args[0]
    assert_true(positional[1].equals(reference_df))
def test_iter_with_mismatch_row_metadata(self):
    # self.headers and csv file headers contain mismatched row metadata
    # (csv includes ROW_ETAG, self.headers does not): iteration must fail.
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        # FIX: don't shadow the builtin iter() with a local variable
        table_iter = iter(table)
        assert_raises(ValueError, next, table_iter)
def test_iter_row_metadata_mismatch_in_headers(self):
    # The csv file carries no row metadata, but self.headers claims it does;
    # advancing the iterator must raise ValueError.
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        # FIX: use a local name instead of shadowing the builtin iter()
        table_iter = iter(table)
        pytest.raises(ValueError, next, table_iter)
def test_iter_row_metadata_mismatch_in_headers(self):
    # csv file does not contain row metadata, self.headers does;
    # the iterator must raise ValueError when advanced.
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        # FIX: don't shadow the builtin iter() with a local variable
        table_iter = iter(table)
        assert_raises(ValueError, next, table_iter)
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.xio import read_files_from_synapse_row
    >>> from mhealthx.xtra import write_synapse_table
    >>> synapse_table = 'syn4590865'
    >>> row = None  # a row from the source table (placeholder; fill in a real row)
    >>> column_name = ''
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_files_from_synapse_row(synapse_table, row, column_name, out_path, username, password)
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of ' + synapse_table
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)
    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in (explicit credentials, or cached ones when omitted):
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Re-index rows 0..n-1 before storing:
    table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)
    syn.store(Table(schema, table_data))
def test_tables_pandas():
    # Build a DataFrame covering the dtypes supported since SYNPY-347.
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse, then read the rows back as csv
    table = syn.store(Table(schema, df))
    results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate the rowId-version row names for comparison
    df.index = ['%s_0' % i for i in range(5)]

    # python3: numpy.bytes_ values must be converted to str or equality fails
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717: query results come back tz-aware, so localize the original
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    assert_frame_equal(df2, df)
def test_pandas_to_table():
    """Check row id/version handling when converting DataFrames to Table rows."""
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    # A dataframe with no row id and version
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], i + 1)
        assert_equals(row[1], ["c", "d", "e"][i])
    assert_equals(len(table), 3)

    # includeRowIdAndRowVersion=True adds empty id/version columns:
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], i + 1)

    # id/version encoded in the index ("1_7" -> id 1, version 7)
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    expected = [["1", "7", 100, "c"], ["2", "7", 200, "d"], ["3", "8", 300, "e"]]
    for want, row in zip(expected, Table(schema, df)):
        assert_equals(row[0], want[0])
        assert_equals(row[1], want[1])
        assert_equals(row[2], want[2])
        assert_equals(row[3], want[3])

    # id/version given as explicit ROW_ID / ROW_VERSION columns
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"], ROW_VERSION=["8", "9", "9"],
                           a=[100, 200, 300], b=["c", "d", "e"]))
    expected = [["0", "8", 100, "c"], ["1", "9", 200, "d"], ["2", "9", 300, "e"]]
    for want, row in zip(expected, Table(schema, df)):
        assert_equals(row[0], want[0])
        assert_equals(row[1], want[1])
        assert_equals(row[2], want[2])
        assert_equals(row[3], want[3])
def test_as_table_columns__with_csv_file():
    # ROW_ID/ROW_VERSION are metadata and must not produce columns; the
    # remaining columns get their types inferred from the csv values.
    csv_source = StringIOContextManager(
        'ROW_ID,ROW_VERSION,Name,Born,Hipness,Living\n'
        '"1", "1", "John Coltrane", 1926, 8.65, False\n'
        '"2", "1", "Miles Davis", 1926, 9.87, False')
    cols = as_table_columns(csv_source)

    expected = [('Name', 'STRING'), ('Born', 'INTEGER'),
                ('Hipness', 'DOUBLE'), ('Living', 'STRING')]
    for col, (name, column_type) in zip(cols, expected):
        assert_equals(col['name'], name)
        assert_equals(col['columnType'], column_type)
def test_iter_with_file_view_row_metadata(self):
    # csv and self.headers both carry ROW_ID/ROW_VERSION/ROW_ETAG, so all
    # metadata cells should come through the iterator unchanged.
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING"),
               SelectColumn(name="ROW_ETAG", columnType="STRING")] + \
              [SelectColumn.from_column(c) for c in cols]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected = [['1', '2', "etag1", "I like trains"],
                    ['5', '1', "etag2", "weeeeeeeeeeee"]]
        for want, got in zip(expected, csv_table):
            assert want == got
def test_iter_with_file_view_row_metadata(self):
    # Matching ROW_ID/ROW_VERSION/ROW_ETAG metadata in both the csv and
    # self.headers: each full row should be yielded verbatim.
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    metadata_headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
                        SelectColumn(name="ROW_VERSION", columnType="STRING"),
                        SelectColumn(name="ROW_ETAG", columnType="STRING")]
    data_headers = [SelectColumn.from_column(c)
                    for c in as_table_columns(StringIOContextManager(data))]
    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        csv_table = CsvFileTable("syn123", "/fake/file/path",
                                 headers=metadata_headers + data_headers)
        expected = [['1', '2', "etag1", "I like trains"],
                    ['5', '1', "etag2", "weeeeeeeeeeee"]]
        for want, got in zip(expected, csv_table):
            assert_equals(want, got)
def test_as_table_columns__with_csv_file():
    # Column models should be inferred from the csv body, skipping the
    # ROW_ID and ROW_VERSION metadata columns.
    cols = as_table_columns(StringIOContextManager(
        'ROW_ID,ROW_VERSION,Name,Born,Hipness,Living\n'
        '"1", "1", "John Coltrane", 1926, 8.65, False\n'
        '"2", "1", "Miles Davis", 1926, 9.87, False'
    ))

    expectations = {0: ('Name', 'STRING'),
                    1: ('Born', 'INTEGER'),
                    2: ('Hipness', 'DOUBLE'),
                    3: ('Living', 'STRING')}
    for idx, (name, col_type) in expectations.items():
        assert_equals(cols[idx]['name'], name)
        assert_equals(cols[idx]['columnType'], col_type)
def test_pandas_to_table():
    """Exercise Table() over DataFrames with various row id/version encodings."""
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    letters = ["c", "d", "e"]

    # plain dataframe: no row ids or versions
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], i + 1)
        assert_equals(row[1], letters[i])
    assert_equals(len(table), 3)

    # includeRowIdAndRowVersion=True produces empty leading cells:
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], i + 1)

    # "<rowId>_<version>" index supplies the metadata
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["1", "2", "3"][i])
        assert_equals(row[1], ["7", "7", "8"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], letters[i])

    # explicit ROW_ID / ROW_VERSION columns supply the metadata
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"],
                           ROW_VERSION=["8", "9", "9"],
                           a=[100, 200, 300],
                           b=["c", "d", "e"]))
    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["0", "1", "2"][i])
        assert_equals(row[1], ["8", "9", "9"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], letters[i])
def concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name,
                                        username='', password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuses the indices of the original DataFrames, increasing the
    number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                    index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse (credentials optional once cached on this machine):
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Column-wise concatenation, reusing the indices of the input frames:
    table_data = pd.concat(frames, axis=1)

    # Create the table schema and store the table in Synapse:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)
    table = syn.store(Table(schema, table_data))

    return table_data, synapse_project_id
def copy_synapse_table(synapse_table_id, synapse_project_id, table_name='',
                       remove_columns=None, username='', password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings or None
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)
    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    # BUG FIX: the default for remove_columns was a mutable list ([]),
    # which is shared across calls; use None as the default instead.
    if remove_columns is None:
        remove_columns = []

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download the Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns:
    for remove_column in remove_columns:
        del table_data[remove_column]

    # Upload to a new Synapse table:
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id