コード例 #1
0
def test_as_table_columns__with_pandas_DataFrame():
    """as_table_columns should infer one Synapse column model per DataFrame column."""
    column_order = ['foobar', 'x', 'n', 'really', 'size']
    frame = pd.DataFrame(
        {'foobar': ("foo", "bar", "baz", "qux", "asdf"),
         'x': tuple(math.pi * i for i in range(5)),
         'n': (101, 202, 303, 404, 505),
         'really': (False, True, False, True, False),
         'size': ('small', 'large', 'medium', 'medium', 'large')},
        columns=column_order)

    actual = as_table_columns(frame)

    # every inferred column carries the same concreteType
    concrete_type = 'org.sagebionetworks.repo.model.table.ColumnModel'
    expected = [
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'foobar',
         'maximumSize': 30, 'concreteType': concrete_type},
        {'columnType': 'DOUBLE', 'name': 'x', 'concreteType': concrete_type},
        {'columnType': 'INTEGER', 'name': 'n', 'concreteType': concrete_type},
        {'columnType': 'BOOLEAN', 'name': 'really', 'concreteType': concrete_type},
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'size',
         'maximumSize': 30, 'concreteType': concrete_type},
    ]
    assert expected == actual
コード例 #2
0
def test_as_table_columns__with_pandas_DataFrame():
    """as_table_columns should infer one Synapse column model per DataFrame column."""
    column_order = ['foobar', 'x', 'n', 'really', 'size']
    frame = pd.DataFrame(
        {'foobar': ("foo", "bar", "baz", "qux", "asdf"),
         'x': tuple(math.pi * i for i in range(5)),
         'n': (101, 202, 303, 404, 505),
         'really': (False, True, False, True, False),
         'size': ('small', 'large', 'medium', 'medium', 'large')},
        columns=column_order)

    actual = as_table_columns(frame)

    # every inferred column carries the same concreteType
    concrete_type = 'org.sagebionetworks.repo.model.table.ColumnModel'
    expected = [
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'foobar',
         'maximumSize': 30, 'concreteType': concrete_type},
        {'columnType': 'DOUBLE', 'name': 'x', 'concreteType': concrete_type},
        {'columnType': 'INTEGER', 'name': 'n', 'concreteType': concrete_type},
        {'columnType': 'BOOLEAN', 'name': 'really', 'concreteType': concrete_type},
        {'defaultValue': '', 'columnType': 'STRING', 'name': 'size',
         'maximumSize': 30, 'concreteType': concrete_type},
    ]
    assert_equals(expected, actual)
コード例 #3
0
def test_tables_pandas():
    """Store a small DataFrame as a Synapse table and read it back unchanged."""
    try:
        # check if we have pandas; skip quietly otherwise
        import pandas as pd

        frame = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
        })

        columns = as_table_columns(frame)
        columns[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=columns, parent=project)

        # store in Synapse, then query the table back out
        stored = syn.store(Table(schema, frame))
        results = syn.tableQuery('select * from %s' % stored.schema.id)
        fetched = results.asDataFrame()

        # simulate rowId-version rownames for comparison
        frame.index = ['%s_0' % i for i in range(5)]
        assert all(fetched == frame)

    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_tables_pandas.\n\n')
コード例 #4
0
def test_as_table_columns():
    """Check that as_table_columns infers the right Synapse type per column.

    Fixes two defects in the original:
    * every comparison was a bare `==` expression whose result was discarded,
      so the test asserted nothing — each is now a real `assert`;
    * rows after the first two all inspected cols[1] instead of the column
      actually under test — indices are now 0..4 to match the DataFrame
      column order (dict insertion order; foobar, x, n, really, size).
    """
    try:
        import pandas as pd

        df = pd.DataFrame({
            'foobar': ("foo", "bar", "baz", "qux", "asdf"),
            'x': tuple(math.pi * i for i in range(5)),
            'n': (101, 202, 303, 404, 505),
            'really': (False, True, False, True, False),
            'size': ('small', 'large', 'medium', 'medium', 'large')
        })

        cols = as_table_columns(df)

        assert cols[0]['name'] == 'foobar'
        assert cols[0]['columnType'] == 'STRING'
        assert cols[1]['name'] == 'x'
        assert cols[1]['columnType'] == 'DOUBLE'
        assert cols[2]['name'] == 'n'
        assert cols[2]['columnType'] == 'INTEGER'
        assert cols[3]['name'] == 'really'
        assert cols[3]['columnType'] == 'BOOLEAN'
        assert cols[4]['name'] == 'size'
        # TODO: support Categorical when fully supported in Pandas Data Frames
        assert cols[4]['columnType'] == 'STRING'

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_as_table_columns.\n\n'
        )
コード例 #5
0
def test_as_table_columns():
    """Check that as_table_columns infers the right Synapse type per column.

    Fixes two defects in the original:
    * every comparison was a bare `==` expression whose result was discarded,
      so the test asserted nothing — each is now a real `assert`;
    * rows after the first two all inspected cols[1] instead of the column
      actually under test — indices are now 0..4 to match the DataFrame
      column order (dict insertion order; foobar, x, n, really, size).
    """
    try:
        import pandas as pd

        df = pd.DataFrame({
            'foobar': ("foo", "bar", "baz", "qux", "asdf"),
            'x': tuple(math.pi * i for i in range(5)),
            'n': (101, 202, 303, 404, 505),
            'really': (False, True, False, True, False),
            'size': ('small', 'large', 'medium', 'medium', 'large')})

        cols = as_table_columns(df)

        assert cols[0]['name'] == 'foobar'
        assert cols[0]['columnType'] == 'STRING'
        assert cols[1]['name'] == 'x'
        assert cols[1]['columnType'] == 'DOUBLE'
        assert cols[2]['name'] == 'n'
        assert cols[2]['columnType'] == 'INTEGER'
        assert cols[3]['name'] == 'really'
        assert cols[3]['columnType'] == 'BOOLEAN'
        assert cols[4]['name'] == 'size'
        # TODO: support Categorical when fully supported in Pandas Data Frames
        assert cols[4]['columnType'] == 'STRING'

    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_as_table_columns.\n\n')
コード例 #6
0
def test_tables_pandas():
    """Round-trip a small DataFrame through a Synapse table and compare."""
    try:
        # check if we have pandas; skip quietly otherwise
        import pandas as pd

        source = {
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(math.pi * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
        }
        frame = pd.DataFrame(source)

        columns = as_table_columns(frame)
        columns[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=columns, parent=project)

        # store in Synapse
        stored = syn.store(Table(schema, frame))

        # retrieve the table and verify
        fetched = syn.tableQuery('select * from %s' % stored.schema.id).asDataFrame()

        # simulate rowId-version rownames for comparison
        frame.index = ['%s_0' % i for i in range(5)]
        assert all(fetched == frame)

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
コード例 #7
0
def test_pandas_to_table():
    """Exercise Table() construction from DataFrames with and without row metadata."""
    pd = _try_import_pandas('test_pandas_to_table')

    frame = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz",
                    parent="syn12345",
                    columns=as_table_columns(frame))
    print("\n", frame, "\n\n")

    # plain dataframe: no row id / version anywhere
    table = Table(schema, frame)

    letters = ["c", "d", "e"]
    for idx, row in enumerate(table):
        print(row)
        assert row[0] == idx + 1
        assert row[1] == letters[idx]

    assert len(table) == 3

    # includeRowIdAndRowVersion=True prepends empty row id and version:
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, frame, includeRowIdAndRowVersion=True)
    for idx, row in enumerate(table):
        print(row)
        assert row[0] is None
        assert row[1] is None
        assert row[2] == idx + 1

    # dataframe whose index encodes "rowid_version" strings
    frame = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                         data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
    print("\n", frame, "\n\n")

    table = Table(schema, frame)
    for idx, row in enumerate(table):
        print(row)
        assert row[0] == ["1", "2", "3"][idx]
        assert row[1] == ["7", "7", "8"][idx]
        assert row[2] == (idx + 1) * 100
        assert row[3] == letters[idx]

    # dataframe with explicit ROW_ID / ROW_VERSION columns
    frame = pd.DataFrame(
        dict(ROW_ID=["0", "1", "2"],
             ROW_VERSION=["8", "9", "9"],
             a=[100, 200, 300],
             b=["c", "d", "e"]))
    print("\n", frame, "\n\n")

    table = Table(schema, frame)
    for idx, row in enumerate(table):
        print(row)
        assert row[0] == ["0", "1", "2"][idx]
        assert row[1] == ["8", "9", "9"][idx]
        assert row[2] == (idx + 1) * 100
        assert row[3] == letters[idx]
コード例 #8
0
def test_pandas_to_table():
    """Exercise Table() construction from DataFrames with and without row metadata."""
    try:
        import pandas as pd

        frame = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
        schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(frame))
        print("\n", frame, "\n\n")

        # plain dataframe: no row id / version anywhere
        table = Table(schema, frame)

        letters = ["c", "d", "e"]
        for idx, row in enumerate(table):
            print(row)
            assert row[0] == idx + 1
            assert row[1] == letters[idx]

        assert len(table) == 3

        # includeRowIdAndRowVersion=True prepends empty row id and version:
        # ROW_ID,ROW_VERSION,a,b
        # ,,1,c
        # ,,2,d
        # ,,3,e
        table = Table(schema, frame, includeRowIdAndRowVersion=True)
        for idx, row in enumerate(table):
            print(row)
            assert row[0] is None
            assert row[1] is None
            assert row[2] == idx + 1

        # dataframe whose index encodes "rowid_version" strings
        frame = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                             data=dict(a=[100, 200, 300], b=["c", "d", "e"]))
        print("\n", frame, "\n\n")

        table = Table(schema, frame)
        for idx, row in enumerate(table):
            print(row)
            assert row[0] == ["1", "2", "3"][idx]
            assert row[1] == ["7", "7", "8"][idx]
            assert row[2] == (idx + 1) * 100
            assert row[3] == letters[idx]

        # dataframe with explicit ROW_ID / ROW_VERSION columns
        frame = pd.DataFrame(dict(ROW_ID=["0", "1", "2"],
                                  ROW_VERSION=["8", "9", "9"],
                                  a=[100, 200, 300],
                                  b=["c", "d", "e"]))
        print("\n", frame, "\n\n")

        table = Table(schema, frame)
        for idx, row in enumerate(table):
            print(row)
            assert row[0] == ["0", "1", "2"][idx]
            assert row[1] == ["8", "9", "9"][idx]
            assert row[2] == (idx + 1) * 100
            assert row[3] == letters[idx]

    except ImportError as e1:
        sys.stderr.write('Pandas is apparently not installed, skipping test_pandas_to_table.\n\n')
コード例 #9
0
ファイル: io_data.py プロジェクト: gciccarelli3/mhealthx
def write_synapse_table(table_data,
                        synapse_project_id,
                        table_name='',
                        username='',
                        password=''):
    """
    Store a pandas DataFrame as a new Synapse table.

    Parameters
    ----------
    table_data : pandas DataFrame
        contents to upload as the Synapse table
    synapse_project_id : string
        Synapse ID of the project that will own the table
    table_name : string
        schema name for the new table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.io_data import read_synapse_table_files, write_synapse_table
    >>> in_synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> column_names = []
    >>> download_limit = None
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_synapse_table_files(in_synapse_table_id, column_names, download_limit, out_path, username, password)
    >>> table_name = 'Contents of ' + in_synapse_table_id
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in with explicit credentials when both are given; otherwise fall
    # back to cached credentials / the Synapse config file.
    login_args = (username, password) if username and password else ()
    syn.login(*login_args)

    # Reindex 0..n-1 (discards any existing index labels).
    table_data.index = range(table_data.shape[0])

    table_schema = Schema(name=table_name,
                          columns=as_table_columns(table_data),
                          parent=synapse_project_id,
                          includeRowIdAndRowVersion=False)

    syn.store(Table(table_schema, table_data))
コード例 #10
0
def test_tables_pandas():
    """Round-trip a DataFrame with extended numpy dtypes through a Synapse table."""
    try:
        # check if we have pandas (and numpy, used for the extra dtypes)
        import pandas as pd
        import numpy as np

        dates = ['2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
                 '2005-02-05']
        words = ['urgot', 'has', 'dark', 'mysterious', 'past']
        df = pd.DataFrame({
            'A': ("foo", "bar", "baz", "qux", "asdf"),
            'B': tuple(0.42 * i for i in range(5)),
            'C': (101, 202, 303, 404, 505),
            'D': (False, True, False, True, False),
            # additional data types supported since SYNPY-347
            'int64': tuple(np.int64(range(5))),
            'datetime64': tuple(np.datetime64(d) for d in dates),
            'string_': tuple(np.string_(s) for s in words),
        })

        cols = as_table_columns(df)
        cols[0].maximumSize = 20
        schema = Schema(name="Nifty Table", columns=cols, parent=project)

        # store in Synapse, then query the table back out as csv
        table = syn.store(Table(schema, df))
        results = syn.tableQuery('select * from %s' % table.schema.id,
                                 resultsAs='csv')
        df2 = results.asDataFrame(convert_to_datetime=True)

        # simulate rowId-version rownames for comparison
        df.index = ['%s_0' % i for i in range(5)]

        # under python3, numpy.bytes_ values must become str for equality to hold
        if six.PY3:
            df['string_'] = df['string_'].transform(str)

        # elementwise compare, then AND down each column and across columns
        assert (df2 == df).all().all()

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test_tables_pandas.\n\n'
        )
コード例 #11
0
 def test_iter_with_table_row_metadata(self):
     """Iteration strips csv row metadata absent from self.headers."""
     # csv file has row metadata, self.headers does not
     csv_text = ("ROW_ID,ROW_VERSION,col\n"
                 "1,2,\"I like trains\"\n"
                 "5,1,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     select_headers = [SelectColumn.from_column(c) for c in inferred]
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=select_headers)
         expected_rows = [["I like trains"], ["weeeeeeeeeeee"]]
         for want, got in zip(expected_rows, table):
             assert_equals(want, got)
コード例 #12
0
 def test_iter_no_row_metadata(self):
     """Iteration yields plain data rows when neither side has row metadata."""
     # both csv headers and self.headers lack row metadata columns
     csv_text = ("col1,col2\n"
                 "1,2\n"
                 "2,1\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     select_headers = [SelectColumn.from_column(c) for c in inferred]
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=select_headers)
         expected_rows = [[1, 2], [2, 1]]
         for want, got in zip(expected_rows, table):
             assert_equals(want, got)
コード例 #13
0
 def test_iter_no_row_metadata(self):
     """Iteration yields plain data rows when neither side has row metadata."""
     # both csv headers and self.headers lack row metadata columns
     csv_text = ("col1,col2\n"
                 "1,2\n"
                 "2,1\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     select_headers = [SelectColumn.from_column(c) for c in inferred]
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=select_headers)
         expected_rows = [[1, 2], [2, 1]]
         for want, got in zip(expected_rows, table):
             assert want == got
コード例 #14
0
 def test_iter_with_table_row_metadata(self):
     """Iteration strips csv row metadata absent from self.headers."""
     # csv file has row metadata, self.headers does not
     csv_text = ("ROW_ID,ROW_VERSION,col\n"
                 "1,2,\"I like trains\"\n"
                 "5,1,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     select_headers = [SelectColumn.from_column(c) for c in inferred]
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=select_headers)
         expected_rows = [["I like trains"], ["weeeeeeeeeeee"]]
         for want, got in zip(expected_rows, table):
             assert want == got
コード例 #15
0
def test_dict_to_table():
    """Table() given a plain dict should build the equivalent DataFrame internally."""
    raw = dict(a=[1, 2, 3], b=["c", "d", "e"])
    expected_df = pd.DataFrame(raw)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(expected_df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, raw)

    # the DataFrame built from the dict is the second positional call argument
    passed_df = mocked_from_data_frame.call_args[0][1]
    assert passed_df.equals(expected_df)
コード例 #16
0
 def test_iter_with_mismatch_row_metadata(self):
     """Iteration raises ValueError when headers and csv disagree on row metadata."""
     # self.headers has ROW_ID/ROW_VERSION but the csv additionally has ROW_ETAG
     csv_text = ("ROW_ID,ROW_VERSION,ROW_ETAG,col\n"
                 "1,2,etag1,\"I like trains\"\n"
                 "5,1,etag2,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         pytest.raises(ValueError, next, table.__iter__())
コード例 #17
0
def test_dict_to_table():
    """Table() given a plain dict should build the equivalent DataFrame internally."""
    raw = dict(a=[1, 2, 3], b=["c", "d", "e"])
    expected_df = pd.DataFrame(raw)
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(expected_df))

    with patch.object(CsvFileTable, "from_data_frame") as mocked_from_data_frame:
        Table(schema, raw)

    # the DataFrame built from the dict is the second positional call argument
    passed_df = mocked_from_data_frame.call_args[0][1]
    assert_true(passed_df.equals(expected_df))
コード例 #18
0
 def test_iter_with_mismatch_row_metadata(self):
     """Iteration raises ValueError when headers and csv disagree on row metadata."""
     # self.headers has ROW_ID/ROW_VERSION but the csv additionally has ROW_ETAG
     csv_text = ("ROW_ID,ROW_VERSION,ROW_ETAG,col\n"
                 "1,2,etag1,\"I like trains\"\n"
                 "5,1,etag2,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         assert_raises(ValueError, next, table.__iter__())
コード例 #19
0
 def test_iter_row_metadata_mismatch_in_headers(self):
     """Iteration raises ValueError when self.headers expects row metadata the csv lacks."""
     # csv file does not contain row metadata, self.headers does
     csv_text = ("col1,col2\n"
                 "1,2\n"
                 "2,1\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         pytest.raises(ValueError, next, table.__iter__())
コード例 #20
0
 def test_iter_row_metadata_mismatch_in_headers(self):
     """Iteration raises ValueError when self.headers expects row metadata the csv lacks."""
     # csv file does not contain row metadata, self.headers does
     csv_text = ("col1,col2\n"
                 "1,2\n"
                 "2,1\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         assert_raises(ValueError, next, table.__iter__())
コード例 #21
0
ファイル: xtra.py プロジェクト: Sandy4321/mhealthx
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> from mhealthx.xio import read_files_from_synapse_row
    >>> from mhealthx.xtra import write_synapse_table
    >>> synapse_table = 'syn4590865'
    >>> row = None  # placeholder: a row object from the table (see read_files_from_synapse_row)
    >>> column_name = ''
    >>> out_path = '.'
    >>> username = ''
    >>> password = ''
    >>> table_data, files = read_files_from_synapse_row(synapse_table, row, column_name, out_path, username, password)
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of ' + synapse_table
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse: explicit credentials when both are supplied,
    # otherwise cached credentials / config file.
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Reindex 0..n-1 (discards any existing index labels before upload).
    table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
コード例 #22
0
def test_tables_pandas():
    """Round-trip a DataFrame with extended numpy dtypes through a Synapse table."""
    # create a pandas DataFrame covering the supported column types
    dates = ['2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
             '2005-02-05']
    words = ['urgot', 'has', 'dark', 'mysterious', 'past']
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in dates),
        'string_': tuple(np.string_(s) for s in words),
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse, then query the table back out as csv
    table = syn.store(Table(schema, df))
    results = syn.tableQuery('select * from %s' % table.schema.id,
                             resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_0' % i for i in range(5)]

    # under python3, numpy.bytes_ values must become str for equality to hold
    if six.PY3:
        df['string_'] = df['string_'].transform(str)

    # SYNPY-717: localize to UTC so both frames carry tz-aware datetimes
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    assert_frame_equal(df2, df)
コード例 #23
0
def test_pandas_to_table():
    """Exercise Table() construction from DataFrames with and without row metadata.

    FIX: the length assertion was mis-indented inside the first row loop
    (re-checked on every iteration); it is a one-time property of the table,
    so it now runs once after the loop, matching the sibling version of this
    test elsewhere in the suite.
    """
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz",
                    parent="syn12345",
                    columns=as_table_columns(df))

    # A dataframe with no row id and version
    table = Table(schema, df)

    for i, row in enumerate(table):
        assert_equals(row[0], (i + 1))
        assert_equals(row[1], ["c", "d", "e"][i])

    assert_equals(len(table), 3)

    # If includeRowIdAndRowVersion=True, include empty row id an versions
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], (i + 1))

    # A dataframe whose index encodes "rowid_version" strings
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"],
                      data=dict(a=[100, 200, 300], b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["1", "2", "3"][i])
        assert_equals(row[1], ["7", "7", "8"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])

    # A dataframe with row id and version in columns
    df = pd.DataFrame(
        dict(ROW_ID=["0", "1", "2"],
             ROW_VERSION=["8", "9", "9"],
             a=[100, 200, 300],
             b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["0", "1", "2"][i])
        assert_equals(row[1], ["8", "9", "9"][i])
        assert_equals(row[2], (i + 1) * 100)
        assert_equals(row[3], ["c", "d", "e"][i])
コード例 #24
0
def test_as_table_columns__with_csv_file():
    """Column types inferred from a csv skip the ROW_ID/ROW_VERSION metadata."""
    csv_source = StringIOContextManager(
        'ROW_ID,ROW_VERSION,Name,Born,Hipness,Living\n'
        '"1", "1", "John Coltrane", 1926, 8.65, False\n'
        '"2", "1", "Miles Davis", 1926, 9.87, False')
    cols = as_table_columns(csv_source)

    expected = [('Name', 'STRING'), ('Born', 'INTEGER'),
                ('Hipness', 'DOUBLE'), ('Living', 'STRING')]
    for i, (name, col_type) in enumerate(expected):
        assert_equals(cols[i]['name'], name)
        assert_equals(cols[i]['columnType'], col_type)
コード例 #25
0
 def test_iter_with_file_view_row_metadata(self):
     """Iteration keeps row metadata when csv and self.headers agree on it."""
     # csv file and self.headers both carry ROW_ID/ROW_VERSION/ROW_ETAG
     csv_text = ("ROW_ID,ROW_VERSION,ROW_ETAG,col\n"
                 "1,2,etag1,\"I like trains\"\n"
                 "5,1,etag2,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING"),
                 SelectColumn(name="ROW_ETAG", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         expected_rows = [['1', '2', "etag1", "I like trains"],
                          ['5', '1', "etag2", "weeeeeeeeeeee"]]
         for want, got in zip(expected_rows, table):
             assert want == got
コード例 #26
0
 def test_iter_with_file_view_row_metadata(self):
     """Iteration keeps row metadata when csv and self.headers agree on it."""
     # csv file and self.headers both carry ROW_ID/ROW_VERSION/ROW_ETAG
     csv_text = ("ROW_ID,ROW_VERSION,ROW_ETAG,col\n"
                 "1,2,etag1,\"I like trains\"\n"
                 "5,1,etag2,\"weeeeeeeeeeee\"\n")
     inferred = as_table_columns(StringIOContextManager(csv_text))
     headers = ([SelectColumn(name="ROW_ID", columnType="STRING"),
                 SelectColumn(name="ROW_VERSION", columnType="STRING"),
                 SelectColumn(name="ROW_ETAG", columnType="STRING")]
                + [SelectColumn.from_column(c) for c in inferred])
     with patch.object(io, "open", return_value=StringIOContextManager(csv_text)):
         table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
         expected_rows = [['1', '2', "etag1", "I like trains"],
                          ['5', '1', "etag2", "weeeeeeeeeeee"]]
         for want, got in zip(expected_rows, table):
             assert_equals(want, got)
コード例 #27
0
def test_as_table_columns__with_csv_file():
    """Column types inferred from a csv skip the ROW_ID/ROW_VERSION metadata."""
    csv_source = StringIOContextManager(
        'ROW_ID,ROW_VERSION,Name,Born,Hipness,Living\n'
        '"1", "1", "John Coltrane", 1926, 8.65, False\n'
        '"2", "1", "Miles Davis", 1926, 9.87, False'
    )
    cols = as_table_columns(csv_source)

    expected = [('Name', 'STRING'), ('Born', 'INTEGER'),
                ('Hipness', 'DOUBLE'), ('Living', 'STRING')]
    for i, (name, col_type) in enumerate(expected):
        assert_equals(cols[i]['name'], name)
        assert_equals(cols[i]['columnType'], col_type)
コード例 #28
0
def test_pandas_to_table():
    """Exercise Table() construction from DataFrames with and without row metadata.

    FIX: the length assertion was mis-indented inside the first row loop
    (re-checked on every iteration); it is a one-time property of the table,
    so it now runs once after the loop, matching the sibling version of this
    test elsewhere in the suite.
    """
    df = pd.DataFrame(dict(a=[1, 2, 3], b=["c", "d", "e"]))
    schema = Schema(name="Baz", parent="syn12345", columns=as_table_columns(df))

    # A dataframe with no row id and version
    table = Table(schema, df)

    for i, row in enumerate(table):
        assert_equals(row[0], (i+1))
        assert_equals(row[1], ["c", "d", "e"][i])

    assert_equals(len(table), 3)

    # If includeRowIdAndRowVersion=True, include empty row id an versions
    # ROW_ID,ROW_VERSION,a,b
    # ,,1,c
    # ,,2,d
    # ,,3,e
    table = Table(schema, df, includeRowIdAndRowVersion=True)
    for i, row in enumerate(table):
        assert_is_none(row[0])
        assert_is_none(row[1])
        assert_equals(row[2], (i+1))

    # A dataframe whose index encodes "rowid_version" strings
    df = pd.DataFrame(index=["1_7", "2_7", "3_8"], data=dict(a=[100, 200, 300], b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["1", "2", "3"][i])
        assert_equals(row[1], ["7", "7", "8"][i])
        assert_equals(row[2], (i+1)*100)
        assert_equals(row[3], ["c", "d", "e"][i])

    # A dataframe with row id and version in columns
    df = pd.DataFrame(dict(ROW_ID=["0", "1", "2"], ROW_VERSION=["8", "9", "9"], a=[100, 200, 300], b=["c", "d", "e"]))

    table = Table(schema, df)
    for i, row in enumerate(table):
        assert_equals(row[0], ["0", "1", "2"][i])
        assert_equals(row[1], ["8", "9", "9"][i])
        assert_equals(row[2], (i+1)*100)
        assert_equals(row[3], ["c", "d", "e"][i])
コード例 #29
0
ファイル: io_data.py プロジェクト: gciccarelli3/mhealthx
def concatenate_tables_to_synapse_table(frames,
                                        synapse_project_id,
                                        table_name,
                                        username='',
                                        password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrame,
    increasing number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                     index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    connection = synapseclient.Synapse()

    # Explicit credentials take precedence; otherwise rely on the
    # cached login for this machine.
    if username and password:
        connection.login(username, password)
    else:
        connection.login()

    # Column-wise concatenation: each frame keeps its original index,
    # and the result grows in width.
    table_data = pd.concat(frames, axis=1)  #, join_axes=[frames[0].index]

    # Build the table schema from the concatenated frame's columns.
    table_schema = Schema(name=table_name,
                          columns=as_table_columns(table_data),
                          parent=synapse_project_id)

    # Persist the result as a Synapse table.
    connection.store(Table(table_schema, table_data))

    return table_data, synapse_project_id
コード例 #30
0
ファイル: io_data.py プロジェクト: gciccarelli3/mhealthx
def copy_synapse_table(synapse_table_id, synapse_project_id, table_name='',
                       remove_columns=None, username='', password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings or None
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns.
    # NOTE: default changed from a mutable [] to None (shared-mutable-default
    # pitfall); behavior is unchanged because None is falsy here too.
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table (reset to a plain 0..n-1 integer index first):
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
コード例 #31
0
ファイル: io_data.py プロジェクト: gciccarelli3/mhealthx
def copy_synapse_table(synapse_table_id,
                       synapse_project_id,
                       table_name='',
                       remove_columns=None,
                       username='',
                       password=''):
    """
    Copy Synapse table to another Synapse project.

    Parameters
    ----------
    synapse_table_id : string
        Synapse ID for table to copy
    synapse_project_id : string
        copy table to project with this Synapse ID
    table_name : string
        schema name of table
    remove_columns : list of strings or None
        column headers for columns to be removed
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        Synapse table contents
    table_name : string
        schema name of table
    synapse_project_id : string
        Synapse ID for project within which table is to be written

    Examples
    --------
    >>> from mhealthx.io_data import copy_synapse_table
    >>> synapse_table_id = 'syn4590865'
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Copy of ' + synapse_table_id
    >>> remove_columns = ['audio_audio.m4a', 'audio_countdown.m4a']
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_project_id = copy_synapse_table(synapse_table_id, synapse_project_id, table_name, remove_columns, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Table, as_table_columns

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Download Synapse table as a dataframe:
    results = syn.tableQuery("select * from {0}".format(synapse_table_id))
    table_data = results.asDataFrame()

    # Remove specified columns.
    # NOTE: default changed from a mutable [] to None (shared-mutable-default
    # pitfall); behavior is unchanged because None is falsy here too.
    if remove_columns:
        for remove_column in remove_columns:
            del table_data[remove_column]

    # Upload to Synapse table (reset to a plain 0..n-1 integer index first):
    table_data.index = range(table_data.shape[0])
    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)
    table = syn.store(Table(schema, table_data))

    return table_data, table_name, synapse_project_id
コード例 #32
0
ファイル: io_data.py プロジェクト: gciccarelli3/mhealthx
def concatenate_tables_to_synapse_table(frames, synapse_project_id,
                                        table_name, username='', password=''):
    """
    Concatenate multiple dataframes and store as a Synapse table.

    Reuse the indices from the original DataFrame,
    increasing number of columns.

    Parameters
    ----------
    frames : list of pandas DataFrames
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> import pandas as pd
    >>> from mhealthx.io_data import concatenate_tables_to_synapse_table
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
    >>>                     'B': ['B0', 'B1', 'B2', 'B3'],
    >>>                     'C': ['C0', 'C1', 'C2', 'C3'],
    >>>                     'D': ['D0', 'D1', 'D2', 'D3']},
    >>>                    index=[0, 1, 2, 3])
    >>> df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
    >>>                     'F': ['B4', 'B5', 'B6', 'B7'],
    >>>                     'G': ['C4', 'C5', 'C6', 'C7'],
    >>>                     'H': ['D4', 'D5', 'D6', 'D7']},
    >>>                     index=[0, 1, 2, 3])
    >>> frames = [df1, df2]
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to join tables'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = concatenate_tables_to_synapse_table(frames, synapse_project_id, table_name, username, password)
    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    client = synapseclient.Synapse()

    # Use explicit credentials when both are given; otherwise fall back
    # to the credentials cached on this machine.
    if username and password:
        client.login(username, password)
    else:
        client.login()

    # Join the frames side by side (axis=1), preserving each frame's index.
    table_data = pd.concat(frames, axis=1)  #, join_axes=[frames[0].index]

    # Derive a schema from the combined frame and store it under the project.
    combined_schema = Schema(name=table_name,
                             columns=as_table_columns(table_data),
                             parent=synapse_project_id)
    client.store(Table(combined_schema, table_data))

    return table_data, synapse_project_id