Ejemplo n.º 1
0
def test_EntityViewSchema__ignore_annotation_column_names():
    """Annotation columns named in ignoredAnnotationColumnNames must be
    excluded from the columns an EntityViewSchema stores."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    scopeIds = ['123']
    # 'long1' appears both in the mocked annotation columns and in the
    # ignore set, so only 'long2' should survive
    entity_view = EntityViewSchema("someName",
                                   scopes=scopeIds,
                                   parent="syn123",
                                   ignoredAnnotationColumnNames={'long1'},
                                   addDefaultViewColumns=False,
                                   addAnnotationColumns=True)

    mocked_annotation_result1 = [
        Column(name='long1', columnType='INTEGER'),
        Column(name='long2', columnType='INTEGER')
    ]

    # patch out network calls; SchemaBase._before_synapse_store is stubbed so
    # only the EntityViewSchema-specific column handling runs
    with patch.object(syn, '_get_annotation_entity_view_columns', return_value=mocked_annotation_result1) as mocked_get_annotations,\
         patch.object(syn, 'getColumns') as mocked_get_columns,\
         patch.object(SchemaBase, "_before_synapse_store"):

        entity_view._before_synapse_store(syn)

        mocked_get_columns.assert_called_once_with([])
        mocked_get_annotations.assert_called_once_with(scopeIds, 'file')

        # the ignored 'long1' column must not be scheduled for storage
        assert_equals([Column(name='long2', columnType='INTEGER')],
                      entity_view.columns_to_store)
Ejemplo n.º 2
0
    def _view_setup(cls):
        """Create a folder holding two externally-referenced (non-uploaded)
        files plus an EntityViewSchema scoped to that folder, and return the
        stored view.  Shared fixture setup for partial-row tests."""
        # set up a file view
        folder = syn.store(
            Folder(name="PartialRowTestFolder" + str(uuid.uuid4()),
                   parent=project))
        # synapseStore=False records only the path; the bogus local paths
        # are never actually uploaded
        syn.store(
            File("~/path/doesnt/matter",
                 name="f1",
                 parent=folder,
                 synapseStore=False))
        syn.store(
            File("~/path/doesnt/matter/again",
                 name="f2",
                 parent=folder,
                 synapseStore=False))

        cols = [
            Column(name='foo', columnType='STRING', maximumSize=1000),
            Column(name='bar', columnType='STRING')
        ]
        return syn.store(
            EntityViewSchema(name='PartialRowTestViews' + str(uuid.uuid4()),
                             columns=cols,
                             addDefaultViewColumns=False,
                             parent=project,
                             scopes=[folder]))
Ejemplo n.º 3
0
def dontruntest_big_tables():
    """Load 10,000 generated rows into a new table in 1000 appends of 10
    rows each, then run a full select and an aggregate query over it.
    (Prefixed 'dontruntest' so this slow test is not collected by default.)"""
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    rows_per_append = 10

    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            # pick a random enum value for the 'foo' column
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(
                Row(('Robot ' + str(i * rows_per_append + j), foo,
                     random.random() * 200.0, random.randint(0, 100),
                     random.random() >= 0.5)))
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))

    # full table scan; result intentionally unused -- this only verifies the
    # query succeeds against a large table
    results = syn.tableQuery("select * from %s" % table1.id)

    results = syn.tableQuery(
        "select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n"
        % table1.id)
    df = results.asDataFrame()
def test_list_of_rows_table():
    """A Table built from a list of rows must round-trip through iteration,
    asRowSet, and asDataFrame without altering any values."""
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    column_specs = [('1', 'Name', 'STRING'),
                    ('2', 'Born', 'INTEGER'),
                    ('3', 'Hipness', 'DOUBLE'),
                    ('4', 'Living', 'BOOLEAN')]
    cols = [Column(id=col_id, name=col_name, columnType=col_type)
            for col_id, col_name, col_type in column_specs]

    schema1 = Schema(name='Jazz Guys', columns=cols, id="syn1000002",
                     parent="syn1000001")

    # headers are needed so values can be cast without storing the table
    table = Table(schema1, data,
                  headers=[SelectColumn.from_column(c) for c in cols])

    # iterating the table yields the original rows unchanged
    for actual_row, expected_row in zip(table, data):
        assert actual_row == expected_row

    # conversion to a RowSet preserves every row's values
    for rs_row, expected_row in zip(table.asRowSet().rows, data):
        assert rs_row['values'] == expected_row

    table.columns = cols

    frame = table.asDataFrame()
    assert list(frame['Name']) == [row[0] for row in data]
Ejemplo n.º 5
0
def test_build_table_download_file_handle_list__repeated_file_handles():
    """_build_table_download_file_handle_list must de-duplicate repeated
    file handle ids instead of scheduling the same download twice."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    cols = [
        Column(name='Name', columnType='STRING', maximumSize=50),
        Column(name='filehandle', columnType='FILEHANDLEID')
    ]

    schema = Schema(name='FileHandleTest', columns=cols, parent='syn420')

    # large file handle numbers unlikely to collide with real ids; two rows
    # deliberately repeat an earlier file handle
    data = [["ayy lmao", 5318008], ["large numberino", 0x5f3759df],
            ["repeated file handle", 5318008],
            ["repeated file handle also", 0x5f3759df]]

    # need columns to do cast_values w/o storing
    table = Table(schema,
                  data,
                  headers=[SelectColumn.from_column(col) for col in cols])

    # BUG FIX: the original called patch.object(...) without activating it,
    # so the cache was never actually patched.  Use it as a context manager
    # so cache lookups really return None during the call under test.
    with patch.object(syn.cache, "get", return_value=None):
        file_handle_associations, file_handle_to_path_map = \
            syn._build_table_download_file_handle_list(table, ['filehandle'])

    # verify only 2 file_handles are added (repeats were ignored)
    assert_equals(2, len(file_handle_associations))
    assert_equals(0,
                  len(file_handle_to_path_map))  # might as well check anyways
Ejemplo n.º 6
0
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    """Round-trip a CSV with a missing INTEGER value through a DataFrame and
    back to CSV, checking the integers are not rewritten as floats (SYNPY-267)."""
    #SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                     delete=False) as temp:
        schedule_for_cleanup(temp.name)
        #2nd row is missing a value in its integer column
        temp.write('x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n')
        temp.flush()
        filename = temp.name

    #create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    # the DataFrame-backed table writes a new CSV file of its own
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    #compare to make sure no .0's were appended to the integers
    assert filecmp.cmp(table.filepath, table_from_dataframe.filepath)
Ejemplo n.º 7
0
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    """Round-trip a CSV containing a missing INTEGER value through a
    DataFrame and compare the resulting frames for equality (SYNPY-267)."""
    # SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    # use the platform line separator so the hand-written file matches what
    # a csv writer would produce on this OS
    line_terminator = str(os.linesep)
    # write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                     delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator + 'a,1,0.9' + line_terminator +
                   'b,,0.8' + line_terminator + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name

    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    # the DataFrame-backed table writes a new CSV file of its own
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    df2 = table_from_dataframe.asDataFrame()
    # frames must be identical, i.e. the missing integer stayed missing
    assert_frame_equal(df, df2)
Ejemplo n.º 8
0
def dontruntest_big_csvs():
    """Write 1000 generated rows to a local CSV, upload it into a new table,
    then query it back as a CsvFileTable.  (Prefixed 'dontruntest' so this
    slow test is not collected by default.)"""
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='n', columnType='INTEGER'),
        Column(name='is_bogus', columnType='BOOLEAN')
    ]

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    # write rows to CSV file
    # the NamedTemporaryFile only reserves a unique filename; the file is
    # re-opened below in text mode with newline='' as the csv module requires
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])

        for i in range(10):
            for j in range(100):
                # random enum value for the 'foo' column
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(
                    ('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                     random.randint(0, 100), random.random() >= 0.5))
    # upload CSV
    syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
Ejemplo n.º 9
0
def test_tables_csv():
    """Create a table from a list of rows (uploaded as a CSV) and verify the
    downloaded CSV query results match the original data row-for-row."""

    # Define schema
    cols = [
        Column(name='Name', columnType='STRING'),
        Column(name='Born', columnType='INTEGER'),
        Column(name='Hipness', columnType='DOUBLE'),
        Column(name='Living', columnType='BOOLEAN')
    ]

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    # the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    # Query and download an identical CSV
    # includeRowIdAndRowVersion=False so downloaded rows align with `data`
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    # Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert_equals(expected_row, row,
                      "expected %s but got %s" % (expected_row, row))
 def test_input_is_SchemaBase(self):
     """getColumns() given a SchemaBase instance should delegate to
     getTableColumns and yield that call's results unchanged."""
     get_table_colums_results = [Column(name='A'), Column(name='B')]
     with patch.object(syn, "getTableColumns", return_value=iter(get_table_colums_results))\
             as mock_get_table_coulmns:
         schema = EntityViewSchema(parentId="syn123")
         results = list(syn.getColumns(schema))
         # results pass through unchanged and the schema itself is forwarded
         assert_equal(get_table_colums_results, results)
         mock_get_table_coulmns.assert_called_with(schema)
Ejemplo n.º 11
0
def test_table_file_view_csv_update_annotations__includeEntityEtag():
    """File annotations must be updatable through an entity view both via a
    RowSet query result and via a CSV query result (the latter requires the
    entity etag to be included in query results)."""
    folder = syn.store(
        synapseclient.Folder(name="updateAnnoFolder" + str(uuid.uuid4()),
                             parent=project))
    anno1_name = "annotationColumn1"
    anno2_name = "annotationColumn2"
    initial_annotations = {
        anno1_name: "initial_value1",
        anno2_name: "initial_value2"
    }
    file_entity = syn.store(
        File(name=
             "test_table_file_view_csv_update_annotations__includeEntityEtag",
             path="~/fakepath",
             synapseStore=False,
             parent=folder,
             annotations=initial_annotations))

    annotation_columns = [
        Column(name=anno1_name, columnType='STRING'),
        Column(name=anno2_name, columnType='STRING')
    ]
    entity_view = syn.store(
        EntityViewSchema(name="TestEntityViewSchemaUpdateAnnotation" +
                         str(uuid.uuid4()),
                         parent=project,
                         scopes=[folder],
                         columns=annotation_columns))

    query_str = "SELECT {anno1}, {anno2} FROM {proj_id}".format(
        anno1=anno1_name, anno2=anno2_name, proj_id=utils.id_of(entity_view))

    # modify first annotation using rowset
    rowset_query_result = syn.tableQuery(query_str, resultsAs="rowset")
    rowset = rowset_query_result.asRowSet()
    rowset_changed_anno_value = "rowset_value_change"
    rowset.rows[0].values[0] = rowset_changed_anno_value
    syn.store(rowset)

    # modify second annotation using csv
    csv_query_result = syn.tableQuery(query_str, resultsAs="csv")
    dataframe = csv_query_result.asDataFrame()
    csv_changed_anno_value = "csv_value_change"
    # BUG FIX: DataFrame.ix was deprecated and removed from pandas; use
    # label-based .loc with the first index label instead.
    dataframe.loc[dataframe.index[0], anno2_name] = csv_changed_anno_value
    syn.store(Table(utils.id_of(entity_view), dataframe))

    # check annotations on the file entity; annotation propagation is
    # asynchronous, so poll until they match or the query timeout elapses
    expected_annotations = {
        anno1_name: [rowset_changed_anno_value],
        anno2_name: [csv_changed_anno_value]
    }
    start_time = time.time()
    while (expected_annotations != file_entity.annotations):
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)
        file_entity = syn.get(file_entity, downloadFile=False)
Ejemplo n.º 12
0
 def _table_setup(cls):
     """Create a table with two INTEGER columns pre-populated with two rows
     that each contain one NULL cell, and return its schema.  Shared
     fixture setup for partial-row tests."""
     # set up a table
     cols = [
         Column(name='foo', columnType='INTEGER'),
         Column(name='bar', columnType='INTEGER')
     ]
     schema = syn.store(
         Schema(name='PartialRowTest' + str(uuid.uuid4()),
                columns=cols,
                parent=project))
     # None leaves the corresponding cell empty
     data = [[1, None], [None, 2]]
     syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
     return schema
Ejemplo n.º 13
0
 def _table_setup(cls):
     """Create a table with two STRING columns pre-populated with two rows
     that each contain one NULL cell, and return its schema.  Shared
     fixture setup for partial-row tests."""
     # set up a table
     cols = [
         Column(name='foo', columnType='STRING', maximumSize=1000),
         Column(name='bar', columnType='STRING')
     ]
     schema = syn.store(
         Schema(name='PartialRowTest' + str(uuid.uuid4()),
                columns=cols,
                parent=project))
     # None leaves the corresponding cell empty
     data = [['foo1', None], [None, 'bar2']]
     syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
     return schema
Ejemplo n.º 14
0
def test_download_table_files():
    """Store rows whose 'cover' column holds uploaded file handles, then
    download each file via downloadTableFile and verify it matches the
    uploaded original byte-for-byte."""
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [[
        "John Coltrane", "Blue Train", 1957, "BLP 1577",
        "coltraneBlueTrain.jpg"
    ], ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            [
                "Sonny Rollins", "Newk's Time", 1958, "BLP 4001",
                "rollinsBN4001.jpg"
            ],
            [
                "Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543",
                "burrellWarholBN1543.jpg"
            ]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        # replace the placeholder filename with the real file handle id
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(columns=cols, schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        'select artist, album, year, catalog, cover from %s' % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        # BUG FIX: converted Python 2 print statement to a print() call
        print("%s_%s" % (row.rowId, row.versionNumber), row.values)
        file_info = syn.downloadTableFile(results,
                                          rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover',
                                          downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])
def test_EntityViewSchema__repeated_columnName_same_type(syn):
    """Duplicate columns sharing both name and type should be collapsed to
    a single column by _filter_duplicate_columns."""
    syn = Synapse(debug=True, skip_checks=True)

    view = EntityViewSchema("someName", parent="syn123")

    duplicated_columns = [
        Column(name='annoName', columnType='INTEGER'),
        Column(name='annoName', columnType='INTEGER'),
    ]

    with patch.object(syn, 'getColumns') as mock_get_columns:
        deduped = view._filter_duplicate_columns(syn, duplicated_columns)

        mock_get_columns.assert_called_once_with([])
        # exactly one copy of the repeated column survives
        assert len(deduped) == 1
        assert deduped[0] == Column(name='annoName', columnType='INTEGER')
Ejemplo n.º 16
0
def test_list_of_rows_table():
    """A Table built from a list of rows must round-trip through iteration,
    asRowSet, and (when pandas is available) asDataFrame unchanged."""
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(name='Jazz Guys',
                     columns=cols,
                     id="syn1000002",
                     parent="syn1000001")

    ## need columns to do cast_values w/o storing
    table = Table(schema1,
                  data,
                  headers=[SelectColumn.from_column(col) for col in cols])

    # iterating the table yields the original rows unchanged
    for table_row, expected_row in zip(table, data):
        assert table_row == expected_row

    # conversion to a RowSet preserves every row's values
    rowset = table.asRowSet()
    for rowset_row, expected_row in zip(rowset.rows, data):
        assert rowset_row['values'] == expected_row

    table.columns = cols

    ## test asDataFrame
    # pandas is optional; skip this part of the test when not installed
    try:
        import pandas as pd

        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in data])

    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping asDataFrame portion of test_list_of_rows_table.\n\n'
        )
Ejemplo n.º 17
0
def test_RowSetTable_len():
    """len() of a RowSetTable reports the number of rows in its RowSet."""
    table_schema = Schema(parentId="syn123",
                          id='syn456',
                          columns=[Column(name='column_name', id='123')])
    two_row_set = RowSet(schema=table_schema,
                         rows=[Row([text])
                               for text in ('first row', 'second row')])
    assert_equals(2, len(RowSetTable(table_schema, two_row_set)))
Ejemplo n.º 18
0
def dontruntest_big_csvs():
    """Generate a 1000-row CSV locally, upload it into a new table, then
    read it back via a table query.  (Prefixed 'dontruntest' so this slow
    test is not collected by default.)"""
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    # BUG FIX: converted Python 2 print statements to print() calls
    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)

    ## write rows to CSV file
    # BUG FIX: NamedTemporaryFile opens in binary mode by default, which
    # breaks csv.writer under Python 3.  Reserve the filename only, then
    # re-open the file as text with newline='' as the csv module requires
    # (mirrors the pattern used by the sibling dontruntest_big_csvs variant).
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])

        for i in range(10):
            for j in range(100):
                # random enum value for the 'foo' column
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(
                    ('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                     random.randint(0, 100), random.random() >= 0.5))
            print("wrote 100 rows to disk")

    ## upload CSV
    UploadToTableResult = syn._uploadCsv(filepath=filename, schema=schema1)

    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn,
                                            "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)
Ejemplo n.º 19
0
def test_build_table__with_csv():
    """build_table should infer columns from a CSV (mocked here) and produce
    a table whose rows and headers match the file contents."""
    string_io = StringIOContextManager('a,b\n' '1,c\n' '2,d\n' '3,e')
    # as_table_columns and io.open are patched so no real file IO occurs
    with patch.object(synapseclient.table, "as_table_columns",
                      return_value=[Column(name="a", columnType="INTEGER"),
                                    Column(name="b", columnType="STRING")]),\
         patch.object(io, "open", return_value=string_io):
        table = build_table("test", "syn123", "some_file_name")
        # NOTE(review): the loop variable is a row index despite the name 'col'
        for col, row in enumerate(table):
            assert_equals(row[0], (col + 1))
            assert_equals(row[1], ["c", "d", "e"][col])
        assert_equals(len(table), 3)
        headers = [{
            'name': 'a',
            'columnType': 'INTEGER'
        }, {
            'name': 'b',
            'columnType': 'STRING'
        }]
        assert_equals(headers, table.headers)
Ejemplo n.º 20
0
def test_EntityViewSchema__repeated_columnName_different_type():
    """Columns that share a name but differ in type are NOT duplicates and
    must both survive _filter_duplicate_columns."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    scopeIds = ['123']
    view = EntityViewSchema("someName",
                            scopes=scopeIds,
                            parent="syn123")

    same_name_columns = [
        Column(name='annoName', columnType='INTEGER'),
        Column(name='annoName', columnType='DOUBLE')
    ]

    with patch.object(syn, 'getColumns') as mock_get_columns:
        kept = view._filter_duplicate_columns(syn, same_name_columns)

        mock_get_columns.assert_called_once_with([])
        # both columns survive because their types differ
        assert_equals(2, len(kept))
        assert_equals(same_name_columns, kept)
Ejemplo n.º 21
0
def test_EntityViewSchema__repeated_columnName():
    """Annotations mapping to two columns with the same name but different
    types cannot be auto-added; _add_annotations_as_columns must raise."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)

    scopeIds = ['123']
    view = EntityViewSchema("someName",
                            scopes=scopeIds,
                            parent="syn123")

    conflicting_annotations = [
        Column(name='annoName', columnType='INTEGER'),
        Column(name='annoName', columnType='DOUBLE')
    ]

    with patch.object(syn, '_get_annotation_entity_view_columns',
                      return_value=conflicting_annotations) as mock_get_annotations, \
         patch.object(syn, 'getColumns') as mock_get_columns:

        # conflicting same-named annotation columns must raise
        assert_raises(ValueError, view._add_annotations_as_columns, syn)

        mock_get_columns.assert_called_once_with([])
        mock_get_annotations.assert_called_once_with(scopeIds, 'file')
Ejemplo n.º 22
0
def test_store_table_datetime():
    """A datetime stored into a DATE column should come back equal (to
    millisecond precision) from a rowset query."""
    # round to milliseconds, the precision Synapse DATE columns keep
    moment = datetime.fromtimestamp(round(time.time(), 3))
    date_schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')],
               project))
    one_row = RowSet(rows=[Row([moment])], schema=date_schema)
    syn.store(Table(date_schema, one_row))

    query_result = syn.tableQuery("select * from %s" % id_of(date_schema),
                                  resultsAs="rowset")
    assert_equals(moment, query_result.rowset['rows'][0]['values'][0])
Ejemplo n.º 23
0
def test_rowset_tables():
    """Storing a RowSet against a freshly created schema must return a
    reference for every stored row."""
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT')
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    # last row's description exceeds 1000 chars to exercise LARGETEXT
    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    stored_references = syn.store(
        RowSet(schema=schema1, rows=[Row(values) for values in data1]))
    assert_equals(len(stored_references['rows']), 4)
Ejemplo n.º 24
0
def test_csv_table():
    """Round-trip a hand-written CSV (with ROW_ID/ROW_VERSION columns)
    through CsvFileTable iteration, asRowSet, and asDataFrame."""
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    # each row is [ROW_ID, ROW_VERSION, Name, Born, Hipness, Living]
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]

    filename = None

    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))

    schema1 = Schema(id='syn1234',
                     name='Jazz Guys',
                     columns=cols,
                     parent="syn1000001")

    #TODO: use StringIO.StringIO(data) rather than writing files
    try:
        ## create CSV file
        # the NamedTemporaryFile only reserves a unique filename; the file
        # is re-opened below in text mode as the csv module requires
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name

        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp,
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)

        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)

        ## need to set column headers to read a CSV file
        table.setColumnHeaders([
            SelectColumn(name="ROW_ID", columnType="STRING"),
            SelectColumn(name="ROW_VERSION", columnType="STRING")
        ] + [SelectColumn.from_column(col) for col in cols])

        ## test iterator
        for table_row, expected_row in zip(table, data):
            assert table_row == expected_row

        ## test asRowSet
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            # values exclude the leading ROW_ID/ROW_VERSION columns
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]

        ## test asDataFrame
        # pandas is optional; skip this part of the test when not installed
        try:
            import pandas as pd

            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            # index is built as "<rowId>_<versionNumber>"
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)

        except ImportError as e1:
            sys.stderr.write(
                'Pandas is apparently not installed, skipping asDataFrame portion of test_csv_table.\n\n'
            )

    except Exception as ex1:
        # best-effort cleanup of the temp file before re-raising the failure
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
Ejemplo n.º 25
0
def test_Schema__max_column_check():
    """Storing a schema with more than MAX_NUM_TABLE_COLUMNS columns must
    raise a ValueError."""
    over_limit = synapseclient.table.MAX_NUM_TABLE_COLUMNS + 1
    schema = Schema(name="someName", parent="idk")
    schema.addColumns(
        [Column(name="colNum%s" % index, columnType="STRING")
         for index in range(over_limit)])
    assert_raises(ValueError, syn.store, schema)
Ejemplo n.º 26
0
def test_schema():
    """Exercise Schema column bookkeeping: addColumn/removeColumn by id,
    and addColumns/removeColumn by value."""
    schema = Schema(name='My Table', parent="syn1000001")

    assert not schema.has_columns()

    # adding a column with an id records that id on the schema properties
    schema.addColumn(Column(id='1', name='Name', columnType='STRING'))
    assert schema.has_columns()
    assert schema.properties.columnIds == ['1']

    # removing by id empties the schema again
    schema.removeColumn('1')
    assert not schema.has_columns()
    assert schema.properties.columnIds == []

    schema = Schema(name='Another Table', parent="syn1000001")

    column_specs = [('Name', 'STRING'),
                    ('Born', 'INTEGER'),
                    ('Hipness', 'DOUBLE'),
                    ('Living', 'BOOLEAN')]
    schema.addColumns([Column(name=col_name, columnType=col_type)
                       for col_name, col_type in column_specs])
    assert schema.has_columns()
    assert len(schema.columns_to_store) == 4
    # every added column is queued for storage
    for col_name, col_type in column_specs:
        assert Column(name=col_name,
                      columnType=col_type) in schema.columns_to_store

    # removing by value drops exactly that column
    schema.removeColumn(Column(name='Living', columnType='BOOLEAN'))
    assert schema.has_columns()
    assert len(schema.columns_to_store) == 3
    assert Column(name='Living',
                  columnType='BOOLEAN') not in schema.columns_to_store
    assert Column(name='Hipness',
                  columnType='DOUBLE') in schema.columns_to_store
Ejemplo n.º 27
0
def test_download_table_files():
    """Upload files referenced by a FILEHANDLEID column, then verify
    downloadTableFile round-trips each file, that cached copies are reused,
    and that downloadTableColumns fetches an entire column."""
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [[
        "John Coltrane", "Blue Train", 1957, "BLP 1577",
        "coltraneBlueTrain.jpg"
    ], ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            [
                "Sonny Rollins", "Newk's Time", 1958, "BLP 4001",
                "rollinsBN4001.jpg"
            ],
            [
                "Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543",
                "burrellWarholBN1543.jpg"
            ]]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        # replace the placeholder filename with the real file handle id
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        "select artist, album, 'year', 'catalog', cover from %s" % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results,
                                     rowId=row.rowId,
                                     versionNumber=row.versionNumber,
                                     column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)

    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch(
            "synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method

        results = syn.tableQuery(
            "select artist, album, 'year', 'catalog', cover from %s where artist = 'John Coltrane'"
            % schema.id,
            resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results,
                                              rowId=row.rowId,
                                              versionNumber=row.versionNumber,
                                              column='cover')
            assert filecmp.cmp(original_files[i], file_path)

        assert not _downloadFile_mock.called, "Should have used cached copy of file and not called _downloadFile"

    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    # BUG FIX: the original loop reused the stale index 'i' left over from
    # the loop above and silently discarded filecmp.cmp's result; enumerate
    # the rows and assert the comparison instead.
    for i, row in enumerate(results):
        assert filecmp.cmp(original_files[i], file_map[row[6]])
Ejemplo n.º 28
0
def test_tables_csv():
    """Round-trip tabular data through Synapse using CSV-backed queries.

    Exercises: table creation from a Schema plus row data, querying back as
    CSV, DataFrame conversion (when pandas is installed), aggregate queries,
    appending rows (including NaN/inf floats), row updates via RowSet and via
    DataFrame, empty query results, and row deletion.

    Relies on module-level fixtures: a logged-in `syn` client and a `project`
    entity created elsewhere in this test module.
    """

    ## Define schema
    cols = [
        Column(name='Name', columnType='STRING'),
        Column(name='Born', columnType='INTEGER'),
        Column(name='Hipness', columnType='DOUBLE'),
        Column(name='Living', columnType='BOOLEAN'),
    ]

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row,
                                                                row)

    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert df.iloc[1, 2] - 9.87 < 0.0001
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for CSV tables.\n\n'
        )

    ## Aggregate query: per-Living-group [Living, min(Born), count, avg(Hipness)]
    expected = {True: [True, 1929, 3, 6.38], False: [False, 1926, 5, 7.104]}

    results = syn.tableQuery(
        'select Living, min(Born), count(Living), avg(Hipness) from %s group by Living'
        % table.schema.id,
        resultsAs="csv",
        includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        ## avg(Hipness) is a float: compare within tolerance
        assert abs(expected[living][3] - row[3]) < 0.0001

    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd

        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for aggregate queries as CSV tables.\n\n'
        )

    ## Append rows, including non-finite floats to check NaN/inf round-trip
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938,
                       float('nan'), False],
                      ["Thelonious Monk", 1917,
                       float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))

    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        ## row[0:2] are rowId/rowVersion, so compare from row[2:] onward.
        ## NaN != NaN, so NaN fields must be matched by isnan on both sides.
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(
                    expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field

    ## Update as a RowSet: bump Hipness for everyone born in 1930
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    syn.store(rowset)

    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery(
        'select Born, COUNT(*) from %s group by Born order by Born' %
        table.schema.id,
        resultsAs="csv")
    assert results.includeRowIdAndRowVersion is False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]

    try:
        import pandas as pd
        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.schema.id,
                                 resultsAs="csv")
        df = results.asDataFrame()
        ## verify the RowSet update above stuck
        assert all(df['Born'].values == 1930)
        assert all(df['Hipness'].values == 8.5)

        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))

        results = syn.tableQuery("select * from %s where Born=1930" %
                                 table.tableId,
                                 resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' %
                             table.tableId,
                             resultsAs="csv")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where Born=2013' %
                                 table.tableId,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )

    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' %
                             table.tableId,
                             resultsAs="csv")
    syn.delete(results)
Ejemplo n.º 29
0
def test_rowset_tables():
    """Round-trip tabular data through Synapse using RowSet-backed queries.

    Exercises: schema/column creation, column retrieval, row insertion,
    row updates via RowSet, adding a column to an existing schema, DATE
    column round-trips, aggregate queries, empty query results, and row
    deletion.

    Relies on module-level fixtures: a logged-in `syn` client and a `project`
    entity created elsewhere in this test module.
    """
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT'),
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id,
                             resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    ## (query orders by name, so sorting the expected rows matches)
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], 'got %s but expected %s' % (
            row['values'], expected_values)

    ## To modify rows, we have to select then first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' %
                             schema1.id,
                             resultsAs="rowset")

    ## make a change: overwrite x for all the 20-year-olds
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id,
                             resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery(
        'select * from %s where cartoon=false order by age' % schema1.id,
        resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column (index 6 is the newly added birthday column)
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery(
        'select name from %s where birthday > %d order by birthday' %
        (schema1.id, date_2008_jan_1),
        resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        ## .ix was removed in pandas 1.0; .loc is the supported equivalent
        assert all(df.loc[:, "name"] == ["Jane", "Henry"])
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    results = syn.tableQuery(
        'select birthday from %s where cartoon=false order by age' %
        schema1.id,
        resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(
            bday,
            "%Y-%m-%d"), "got %s but expected %s" % (row['values'][0], bday)

    try:
        import pandas as pd
        results = syn.tableQuery(
            "select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
            % schema1.id,
            resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    ## test delete rows by deleting cartoon characters
    syn.delete(
        syn.tableQuery('select name from %s where cartoon = true' % schema1.id,
                       resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' %
                             schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane",
            "Henry"] == [row['values'][0] for row in results]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                             resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' %
                                 schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )
Ejemplo n.º 30
0
def files_to_synapse_table(in_files,
                           synapse_project_id,
                           table_name,
                           column_name='fileID',
                           username='',
                           password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    >>> #column_name = 'fileID2'
    >>> #in_files = ['/Users/arno/Local/wav/test2.wav']
    >>> #table_data, synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    files_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        files_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # See if Synapse table exists:
    # tex = list(syn.chunkedQuery("select id from Table where parentId=='{0}'"
    #                             " and name=='{1}'".format(synapse_project_id,
    #                                                       table_name)))
    # If Synapse table does not exist, create table schema:
    # if not tex:

    # Create table schema:
    schema = syn.store(
        Schema(name=table_name,
               columns=[new_column_header],
               parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(
        RowSet(columns=[new_column_header],
               schema=schema,
               rows=[Row(r) for r in files_handles]))