def test_EntityViewSchema__ignore_annotation_column_names():
    # Verify that names listed in ignoredAnnotationColumnNames are excluded
    # from the annotation-derived columns queued on the view: of the two
    # mocked annotation columns ('long1', 'long2'), only 'long2' survives.
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)
    scopeIds = ['123']
    entity_view = EntityViewSchema("someName", scopes=scopeIds, parent="syn123",
                                   ignoredAnnotationColumnNames={'long1'},
                                   addDefaultViewColumns=False,
                                   addAnnotationColumns=True)
    mocked_annotation_result1 = [Column(name='long1', columnType='INTEGER'),
                                 Column(name='long2', columnType='INTEGER')]
    # SchemaBase._before_synapse_store is patched so only the EntityViewSchema
    # layer of the pre-store hook runs; getColumns is mocked to return nothing.
    with patch.object(syn, '_get_annotation_entity_view_columns',
                      return_value=mocked_annotation_result1) as mocked_get_annotations,\
            patch.object(syn, 'getColumns') as mocked_get_columns,\
            patch.object(SchemaBase, "_before_synapse_store"):
        entity_view._before_synapse_store(syn)
        mocked_get_columns.assert_called_once_with([])
        mocked_get_annotations.assert_called_once_with(scopeIds, 'file')
        assert_equals([Column(name='long2', columnType='INTEGER')],
                      entity_view.columns_to_store)
def _view_setup(cls):
    """Set up a file view: a folder with two files and a scoped EntityViewSchema."""
    test_folder = syn.store(
        Folder(name="PartialRowTestFolder" + str(uuid.uuid4()), parent=project))
    syn.store(File("~/path/doesnt/matter", name="f1",
                   parent=test_folder, synapseStore=False))
    syn.store(File("~/path/doesnt/matter/again", name="f2",
                   parent=test_folder, synapseStore=False))
    view_columns = [
        Column(name='foo', columnType='STRING', maximumSize=1000),
        Column(name='bar', columnType='STRING'),
    ]
    view = EntityViewSchema(name='PartialRowTestViews' + str(uuid.uuid4()),
                            columns=view_columns,
                            addDefaultViewColumns=False,
                            parent=project,
                            scopes=[test_folder])
    return syn.store(view)
def dontruntest_big_tables():
    # Manually-run stress test (the "dontruntest" prefix keeps it out of normal
    # test discovery): appends 1000 batches of 10 random rows to a table, then
    # runs a full select and an aggregate group-by query over it.
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))
    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    rows_per_append = 10
    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            # pick a random enum value for the 'foo' column
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(Row(('Robot ' + str(i * rows_per_append + j), foo,
                             random.random() * 200.0, random.randint(0, 100),
                             random.random() >= 0.5)))
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))
    results = syn.tableQuery("select * from %s" % table1.id)
    # aggregate query; result is only materialized into a DataFrame, no asserts
    results = syn.tableQuery(
        "select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n" % table1.id)
    df = results.asDataFrame()
def test_list_of_rows_table():
    """Round-trip a list-of-lists through Table: iteration, asRowSet, asDataFrame."""
    expected_rows = [["John Coltrane", 1926, 8.65, False],
                     ["Miles Davis", 1926, 9.87, False],
                     ["Bill Evans", 1929, 7.65, False],
                     ["Paul Chambers", 1935, 5.14, False],
                     ["Jimmy Cobb", 1929, 5.78, True],
                     ["Scott LaFaro", 1936, 4.21, False],
                     ["Sonny Rollins", 1930, 8.99, True],
                     ["Kenny Burrel", 1931, 4.37, True]]
    column_specs = [('1', 'Name', 'STRING'), ('2', 'Born', 'INTEGER'),
                    ('3', 'Hipness', 'DOUBLE'), ('4', 'Living', 'BOOLEAN')]
    cols = [Column(id=cid, name=cname, columnType=ctype)
            for cid, cname, ctype in column_specs]
    schema1 = Schema(name='Jazz Guys', columns=cols,
                     id="syn1000002", parent="syn1000001")
    # need columns to do cast_values w/o storing
    table = Table(schema1, expected_rows,
                  headers=[SelectColumn.from_column(c) for c in cols])
    # iterating the table yields the original rows
    for actual_row, want in zip(table, expected_rows):
        assert actual_row == want
    # conversion to a RowSet preserves the values
    rowset = table.asRowSet()
    for rs_row, want in zip(rowset.rows, expected_rows):
        assert rs_row['values'] == want
    table.columns = cols
    # conversion to a DataFrame preserves column data
    df = table.asDataFrame()
    assert list(df['Name']) == [r[0] for r in expected_rows]
def test_build_table_download_file_handle_list__repeated_file_handles():
    """Repeated FILEHANDLEID values must be deduplicated in the download list.

    Fix: ``patch.object(syn.cache, "get", ...)`` was previously constructed but
    never started, so the cache patch had no effect and real cache entries
    could leak into the test; it is now used as a context manager around the
    call under test.
    """
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)
    cols = [
        Column(name='Name', columnType='STRING', maximumSize=50),
        Column(name='filehandle', columnType='FILEHANDLEID')
    ]
    schema = Schema(name='FileHandleTest', columns=cols, parent='syn420')
    # using some large filehandle numbers so they are unlikely to exist for real
    data = [["ayy lmao", 5318008],
            ["large numberino", 0x5f3759df],
            ["repeated file handle", 5318008],
            ["repeated file handle also", 0x5f3759df]]
    ## need columns to do cast_values w/o storing
    table = Table(schema, data,
                  headers=[SelectColumn.from_column(col) for col in cols])
    # patch the cache so we don't look there in case FileHandle ids actually exist there
    with patch.object(syn.cache, "get", return_value=None):
        file_handle_associations, file_handle_to_path_map = \
            syn._build_table_download_file_handle_list(table, ['filehandle'])
    # verify only 2 file_handles are added (repeats were ignored)
    assert_equals(2, len(file_handle_associations))
    assert_equals(0, len(file_handle_to_path_map))  # might as well check anyways
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267: an INTEGER column with a missing value must not have '.0'
    # appended to its other values when round-tripped through a DataFrame.
    # NOTE(review): this file contains a second, newer definition under the
    # same name (os.linesep-aware variant); at import time that later
    # definition shadows this one — presumably this copy is obsolete.
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    ## write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n')
        temp.flush()
        filename = temp.name
    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()
    table_from_dataframe = Table(schema, df)
    # the two Tables were written to different temp files...
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    # ...but byte-compare to make sure no .0's were appended to the integers
    assert filecmp.cmp(table.filepath, table_from_dataframe.filepath)
def test_synapse_integer_columns_with_missing_values_from_dataframe():
    # SYNPY-267: an INTEGER column with a missing value must survive a
    # CSV -> DataFrame -> CSV round trip intact. This variant writes the CSV
    # with the platform line separator and compares DataFrames instead of raw
    # files, making it robust across operating systems.
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    line_terminator = str(os.linesep)
    # write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator
                   + 'a,1,0.9' + line_terminator
                   + 'b,,0.8' + line_terminator
                   + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name
    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()
    table_from_dataframe = Table(schema, df)
    # distinct temp files, identical logical content
    assert_not_equal(table.filepath, table_from_dataframe.filepath)
    df2 = table_from_dataframe.asDataFrame()
    assert_frame_equal(df, df2)
def dontruntest_big_csvs():
    # Manually-run stress test ("dontruntest" prefix keeps it out of normal
    # discovery): writes 1000 random rows to a CSV, uploads it, queries it back.
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='n', columnType='INTEGER'),
        Column(name='is_bogus', columnType='BOOLEAN')
    ]
    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    # write rows to CSV file; the NamedTemporaryFile is only used to reserve a
    # cleanup-tracked path — the actual writing goes through a text-mode handle
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name
    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])
        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo,
                                 random.random() * 200.0,
                                 random.randint(0, 100),
                                 random.random() >= 0.5))
    # upload CSV
    syn._uploadCsv(filepath=temp.name, schema=schema1)
    from synapseclient.table import CsvFileTable
    CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
def test_tables_csv():
    """Store row data as a CSV-backed table and verify the query round trip."""
    # Define schema
    column_specs = [('Name', 'STRING'), ('Born', 'INTEGER'),
                    ('Hipness', 'DOUBLE'), ('Living', 'BOOLEAN')]
    cols = [Column(name=cname, columnType=ctype) for cname, ctype in column_specs]
    schema = Schema(name='Jazz Guys', columns=cols, parent=project)
    jazz_guys = [["John Coltrane", 1926, 8.65, False],
                 ["Miles Davis", 1926, 9.87, False],
                 ["Bill Evans", 1929, 7.65, False],
                 ["Paul Chambers", 1935, 5.14, False],
                 ["Jimmy Cobb", 1929, 5.78, True],
                 ["Scott LaFaro", 1936, 4.21, False],
                 ["Sonny Rollins", 1930, 8.99, True],
                 ["Kenny Burrel", 1931, 4.37, True]]
    # storing creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, jazz_guys))
    # Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)
    # Test that CSV file came back as expected
    for expected_row, row in zip(jazz_guys, results):
        assert_equals(expected_row, row,
                      "expected %s but got %s" % (expected_row, row))
def test_input_is_SchemaBase(self):
    """getColumns should delegate to getTableColumns when given a SchemaBase."""
    expected_columns = [Column(name='A'), Column(name='B')]
    with patch.object(syn, "getTableColumns",
                      return_value=iter(expected_columns)) as mock_get_table_columns:
        view_schema = EntityViewSchema(parentId="syn123")
        actual_columns = list(syn.getColumns(view_schema))
        assert_equal(expected_columns, actual_columns)
        mock_get_table_columns.assert_called_with(view_schema)
def test_table_file_view_csv_update_annotations__includeEntityEtag():
    """Update a file's annotations through an entity view via both the rowset
    and the csv query paths, then poll until the entity reflects both changes.

    Fix: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in 1.0.
    Because the csv query result's index may be row-id strings (not integers),
    the positional ``iloc`` indexer with ``columns.get_loc`` is the faithful
    replacement for the old mixed ``ix`` lookup on row position 0.
    """
    folder = syn.store(
        synapseclient.Folder(name="updateAnnoFolder" + str(uuid.uuid4()),
                             parent=project))
    anno1_name = "annotationColumn1"
    anno2_name = "annotationColumn2"
    initial_annotations = {
        anno1_name: "initial_value1",
        anno2_name: "initial_value2"
    }
    file_entity = syn.store(
        File(name="test_table_file_view_csv_update_annotations__includeEntityEtag",
             path="~/fakepath",
             synapseStore=False,
             parent=folder,
             annotations=initial_annotations))
    annotation_columns = [
        Column(name=anno1_name, columnType='STRING'),
        Column(name=anno2_name, columnType='STRING')
    ]
    entity_view = syn.store(
        EntityViewSchema(name="TestEntityViewSchemaUpdateAnnotation" + str(uuid.uuid4()),
                         parent=project,
                         scopes=[folder],
                         columns=annotation_columns))
    query_str = "SELECT {anno1}, {anno2} FROM {proj_id}".format(
        anno1=anno1_name, anno2=anno2_name, proj_id=utils.id_of(entity_view))
    # modify first annotation using rowset
    rowset_query_result = syn.tableQuery(query_str, resultsAs="rowset")
    rowset = rowset_query_result.asRowSet()
    rowset_changed_anno_value = "rowset_value_change"
    rowset.rows[0].values[0] = rowset_changed_anno_value
    syn.store(rowset)
    # modify second annotation using csv
    csv_query_result = syn.tableQuery(query_str, resultsAs="csv")
    dataframe = csv_query_result.asDataFrame()
    csv_changed_anno_value = "csv_value_change"
    # positional row 0, column looked up by label (replaces removed .ix)
    dataframe.iloc[0, dataframe.columns.get_loc(anno2_name)] = csv_changed_anno_value
    syn.store(Table(utils.id_of(entity_view), dataframe))
    # check annotations in the file entity. Annotations may not be immediately
    # updated so we wait in a polling loop bounded by QUERY_TIMEOUT_SEC
    expected_annotations = {
        anno1_name: [rowset_changed_anno_value],
        anno2_name: [csv_changed_anno_value]
    }
    start_time = time.time()
    while expected_annotations != file_entity.annotations:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)
        file_entity = syn.get(file_entity, downloadFile=False)
def _table_setup(cls):
    """Create an INTEGER-column table seeded with two partially-filled rows."""
    table_columns = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER'),
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=table_columns, parent=project))
    # one row missing 'bar', one missing 'foo'
    seed_rows = [Row([1, None]), Row([None, 2])]
    syn.store(RowSet(schema=schema, rows=seed_rows))
    return schema
def _table_setup(cls):
    """Create a STRING-column table seeded with two partially-filled rows."""
    table_columns = [
        Column(name='foo', columnType='STRING', maximumSize=1000),
        Column(name='bar', columnType='STRING'),
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=table_columns, parent=project))
    # one row missing 'bar', one missing 'foo'
    seed_rows = [Row(['foo1', None]), Row([None, 'bar2'])]
    syn.store(RowSet(schema=schema, rows=seed_rows))
    return schema
def test_download_table_files():
    """Upload files referenced by a FILEHANDLEID column, query them back, and
    verify each downloaded file matches its original.

    Fix: the Python 2 ``print`` statement (``print "%s_%s" % ...``) is a
    SyntaxError under Python 3; converted to the ``print()`` function, which
    this file already uses elsewhere (e.g. ``print(ex)``).
    """
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]
    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)
    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]
    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        row[4] = file_handle['id']
    row_reference_set = syn.store(
        RowSet(columns=cols, schema=schema, rows=[Row(r) for r in data]))
    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        'select artist, album, year, catalog, cover from %s' % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        print("%s_%s" % (row.rowId, row.versionNumber), row.values)
        file_info = syn.downloadTableFile(results,
                                          rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover',
                                          downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])
def test_EntityViewSchema__repeated_columnName_same_type(syn):
    """Two columns sharing name AND type collapse to a single column."""
    # NOTE(review): the `syn` parameter is immediately shadowed by a fresh
    # offline client — presumably intentional to avoid network calls.
    syn = Synapse(debug=True, skip_checks=True)
    view = EntityViewSchema("someName", parent="syn123")
    duplicated_columns = [Column(name='annoName', columnType='INTEGER'),
                          Column(name='annoName', columnType='INTEGER')]
    with patch.object(syn, 'getColumns') as mocked_get_columns:
        kept_columns = view._filter_duplicate_columns(syn, duplicated_columns)
        mocked_get_columns.assert_called_once_with([])
        assert 1 == len(kept_columns)
        assert Column(name='annoName', columnType='INTEGER') == kept_columns[0]
def test_list_of_rows_table():
    """Round-trip a list-of-lists through Table (pandas part optional)."""
    expected_rows = [["John Coltrane", 1926, 8.65, False],
                     ["Miles Davis", 1926, 9.87, False],
                     ["Bill Evans", 1929, 7.65, False],
                     ["Paul Chambers", 1935, 5.14, False],
                     ["Jimmy Cobb", 1929, 5.78, True],
                     ["Scott LaFaro", 1936, 4.21, False],
                     ["Sonny Rollins", 1930, 8.99, True],
                     ["Kenny Burrel", 1931, 4.37, True]]
    column_specs = [('1', 'Name', 'STRING'), ('2', 'Born', 'INTEGER'),
                    ('3', 'Hipness', 'DOUBLE'), ('4', 'Living', 'BOOLEAN')]
    cols = [Column(id=cid, name=cname, columnType=ctype)
            for cid, cname, ctype in column_specs]
    schema1 = Schema(name='Jazz Guys', columns=cols,
                     id="syn1000002", parent="syn1000001")
    ## need columns to do cast_values w/o storing
    table = Table(schema1, expected_rows,
                  headers=[SelectColumn.from_column(c) for c in cols])
    # iterating the table yields the original rows
    for actual_row, want in zip(table, expected_rows):
        assert actual_row == want
    # conversion to a RowSet preserves the values
    rowset = table.asRowSet()
    for rs_row, want in zip(rowset.rows, expected_rows):
        assert rs_row['values'] == want
    table.columns = cols
    ## test asDataFrame (skipped gracefully when pandas is unavailable)
    try:
        import pandas as pd
        df = table.asDataFrame()
        assert all(df['Name'] == [r[0] for r in expected_rows])
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping asDataFrame portion of test_list_of_rows_table.\n\n'
        )
def test_RowSetTable_len():
    """len() of a RowSetTable equals the number of rows in its RowSet."""
    table_schema = Schema(parentId="syn123", id='syn456',
                          columns=[Column(name='column_name', id='123')])
    two_row_set = RowSet(schema=table_schema,
                         rows=[Row(['first row']), Row(['second row'])])
    wrapped = RowSetTable(table_schema, two_row_set)
    assert_equals(2, len(wrapped))
def dontruntest_big_csvs():
    """Manually-run stress test: write 1000 rows to a CSV, upload, query back.

    Fixes: (1) Python 2 ``print`` statements are SyntaxErrors under Python 3 —
    converted to ``print()``, which this file already uses elsewhere;
    (2) ``csv.writer`` was writing to the binary-mode ``NamedTemporaryFile``
    handle, which raises TypeError on Python 3 — the file is now reopened in
    text mode with utf-8 encoding, matching the pattern used by the other
    ``dontruntest_big_csvs`` variant in this file.
    """
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))
    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)
    ## write rows to CSV file: reserve a cleanup-tracked temp path, then write
    ## through a text-mode handle so csv.writer gets str, not bytes
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name
    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])
        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo,
                                 random.random() * 200.0,
                                 random.randint(0, 100),
                                 random.random() >= 0.5))
            print("wrote 100 rows to disk")
    ## upload CSV
    UploadToTableResult = syn._uploadCsv(filepath=temp.name, schema=schema1)
    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)
    for row in results:
        print(row)
def test_build_table__with_csv():
    # build_table should infer columns from the CSV (via the mocked
    # as_table_columns) and expose the parsed rows and headers. io.open is
    # patched so "some_file_name" is served from the in-memory CSV text.
    string_io = StringIOContextManager('a,b\n'
                                       '1,c\n'
                                       '2,d\n'
                                       '3,e')
    with patch.object(synapseclient.table, "as_table_columns",
                      return_value=[Column(name="a", columnType="INTEGER"),
                                    Column(name="b", columnType="STRING")]),\
            patch.object(io, "open", return_value=string_io):
        table = build_table("test", "syn123", "some_file_name")
        # rows come back typed: column 'a' as ints 1..3, column 'b' as strings
        for col, row in enumerate(table):
            assert_equals(row[0], (col + 1))
            assert_equals(row[1], ["c", "d", "e"][col])
        assert_equals(len(table), 3)
        headers = [{
            'name': 'a',
            'columnType': 'INTEGER'
        }, {
            'name': 'b',
            'columnType': 'STRING'
        }]
        assert_equals(headers, table.headers)
def test_EntityViewSchema__repeated_columnName_different_type():
    """Columns sharing a name but differing in type are both kept (no filtering)."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)
    scopeIds = ['123']
    view = EntityViewSchema("someName", scopes=scopeIds, parent="syn123")
    same_name_columns = [
        Column(name='annoName', columnType='INTEGER'),
        Column(name='annoName', columnType='DOUBLE'),
    ]
    with patch.object(syn, 'getColumns') as mocked_get_columns:
        kept_columns = view._filter_duplicate_columns(syn, same_name_columns)
        mocked_get_columns.assert_called_once_with([])
        assert_equals(2, len(kept_columns))
        assert_equals(same_name_columns, kept_columns)
def test_EntityViewSchema__repeated_columnName():
    """Annotation columns with the same name but conflicting types raise ValueError."""
    syn = synapseclient.client.Synapse(debug=True, skip_checks=True)
    scopeIds = ['123']
    view = EntityViewSchema("someName", scopes=scopeIds, parent="syn123")
    conflicting_annotations = [
        Column(name='annoName', columnType='INTEGER'),
        Column(name='annoName', columnType='DOUBLE'),
    ]
    with patch.object(syn, '_get_annotation_entity_view_columns',
                      return_value=conflicting_annotations) as mocked_get_annotations,\
            patch.object(syn, 'getColumns') as mocked_get_columns:
        assert_raises(ValueError, view._add_annotations_as_columns, syn)
        mocked_get_columns.assert_called_once_with([])
        mocked_get_annotations.assert_called_once_with(scopeIds, 'file')
def test_store_table_datetime():
    """Round-trip a datetime through a DATE column via a rowset query."""
    # round to millisecond precision, which is what Synapse DATE columns keep
    stored_datetime = datetime.fromtimestamp(round(time.time(), 3))
    date_schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')], project))
    single_row = RowSet(rows=[Row([stored_datetime])], schema=date_schema)
    rowset_table = syn.store(Table(date_schema, single_row))
    query_result = syn.tableQuery("select * from %s" % id_of(date_schema),
                                  resultsAs="rowset")
    assert_equals(stored_datetime,
                  query_result.rowset['rows'][0]['values'][0])
def test_rowset_tables():
    """Store a schema plus one batch of rows; check the row-reference count."""
    table_columns = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT'),
    ]
    schema1 = syn.store(Schema(name='Foo Table', columns=table_columns, parent=project))
    people = [
        ['Chris', 'bar', 11.23, 45, False, 'a'],
        ['Jen', 'bat', 14.56, 40, False, 'b'],
        ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
        ['Henry', 'bar', 10.12, 1, False, 'd'],
    ]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(values) for values in people]))
    assert_equals(len(row_reference_set1['rows']), 4)
def test_csv_table():
    ## Maybe not truly a unit test, but here because it doesn't do
    ## network IO to synapse
    # Writes row data (including ROW_ID/ROW_VERSION prefixes) to a CSV, then
    # reads it back through CsvFileTable: iterator, asRowSet, asDataFrame.
    data = [["1", "1", "John Coltrane", 1926, 8.65, False],
            ["2", "1", "Miles Davis", 1926, 9.87, False],
            ["3", "1", "Bill Evans", 1929, 7.65, False],
            ["4", "1", "Paul Chambers", 1935, 5.14, False],
            ["5", "1", "Jimmy Cobb", 1929, 5.78, True],
            ["6", "1", "Scott LaFaro", 1936, 4.21, False],
            ["7", "1", "Sonny Rollins", 1930, 8.99, True],
            ["8", "1", "Kenny Burrel", 1931, 4.37, True]]
    filename = None
    cols = []
    cols.append(Column(id='1', name='Name', columnType='STRING'))
    cols.append(Column(id='2', name='Born', columnType='INTEGER'))
    cols.append(Column(id='3', name='Hipness', columnType='DOUBLE'))
    cols.append(Column(id='4', name='Living', columnType='BOOLEAN'))
    schema1 = Schema(id='syn1234', name='Jazz Guys', columns=cols, parent="syn1000001")
    #TODO: use StringIO.StringIO(data) rather than writing files
    try:
        ## create CSV file: reserve the temp path, then write via a text handle
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            filename = temp.name
        with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
            writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator=str(os.linesep))
            headers = ['ROW_ID', 'ROW_VERSION'] + [col.name for col in cols]
            writer.writerow(headers)
            for row in data:
                writer.writerow(row)
        table = Table(schema1, filename)
        assert isinstance(table, CsvFileTable)
        ## need to set column headers to read a CSV file
        table.setColumnHeaders([
            SelectColumn(name="ROW_ID", columnType="STRING"),
            SelectColumn(name="ROW_VERSION", columnType="STRING")
        ] + [SelectColumn.from_column(col) for col in cols])
        ## test iterator
        for table_row, expected_row in zip(table, data):
            assert table_row == expected_row
        ## test asRowSet: row id/version are split out of the value list
        rowset = table.asRowSet()
        for rowset_row, expected_row in zip(rowset.rows, data):
            assert rowset_row['values'] == expected_row[2:]
            assert rowset_row['rowId'] == expected_row[0]
            assert rowset_row['versionNumber'] == expected_row[1]
        ## test asDataFrame (skipped gracefully when pandas is unavailable)
        try:
            import pandas as pd
            df = table.asDataFrame()
            assert all(df['Name'] == [row[2] for row in data])
            assert all(df['Born'] == [row[3] for row in data])
            assert all(df['Living'] == [row[5] for row in data])
            # index is "rowId_versionNumber"
            assert all(df.index == ['%s_%s' % tuple(row[0:2]) for row in data])
            assert df.shape == (8, 4)
        except ImportError as e1:
            sys.stderr.write(
                'Pandas is apparently not installed, skipping asDataFrame portion of test_csv_table.\n\n'
            )
    except Exception as ex1:
        # best-effort cleanup of the temp file/dir before re-raising
        if filename:
            try:
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.remove(filename)
            except Exception as ex:
                print(ex)
        raise
def test_Schema__max_column_check():
    """Storing a schema with more than MAX_NUM_TABLE_COLUMNS columns must raise."""
    over_limit = synapseclient.table.MAX_NUM_TABLE_COLUMNS + 1
    table = Schema(name="someName", parent="idk")
    table.addColumns(
        Column(name="colNum%s" % index, columnType="STRING")
        for index in range(over_limit))
    assert_raises(ValueError, syn.store, table)
def test_schema():
    """Exercise Schema column bookkeeping: addColumn/removeColumn and addColumns."""
    # single column by id: add then remove
    schema = Schema(name='My Table', parent="syn1000001")
    assert not schema.has_columns()
    schema.addColumn(Column(id='1', name='Name', columnType='STRING'))
    assert schema.has_columns()
    assert schema.properties.columnIds == ['1']
    schema.removeColumn('1')
    assert not schema.has_columns()
    assert schema.properties.columnIds == []

    # unsaved columns go through columns_to_store instead of columnIds
    schema = Schema(name='Another Table', parent="syn1000001")
    column_specs = [('Name', 'STRING'), ('Born', 'INTEGER'),
                    ('Hipness', 'DOUBLE'), ('Living', 'BOOLEAN')]
    schema.addColumns([Column(name=cname, columnType=ctype)
                       for cname, ctype in column_specs])
    assert schema.has_columns()
    assert len(schema.columns_to_store) == 4
    for cname, ctype in column_specs:
        assert Column(name=cname, columnType=ctype) in schema.columns_to_store

    # removing by value drops exactly the matching column
    schema.removeColumn(Column(name='Living', columnType='BOOLEAN'))
    assert schema.has_columns()
    assert len(schema.columns_to_store) == 3
    assert Column(name='Living', columnType='BOOLEAN') not in schema.columns_to_store
    assert Column(name='Hipness', columnType='DOUBLE') in schema.columns_to_store
def test_download_table_files():
    """Upload files referenced by a FILEHANDLEID column, then verify download
    paths: per-row download, cache hits, and bulk column download.

    Fix: the final verification loop iterated ``for row in results`` while
    indexing ``original_files[i]`` with a stale ``i`` left over from an earlier
    loop, and it dropped the ``filecmp.cmp`` result on the floor (no assert) —
    so the bulk-download content check never actually checked anything. It now
    enumerates the rows and asserts each comparison.
    """
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]
    schema = syn.store(Schema(name='Jazz Albums', columns=cols, parent=project))
    schedule_for_cleanup(schema)
    data = [["John Coltrane", "Blue Train", 1957, "BLP 1577", "coltraneBlueTrain.jpg"],
            ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
            ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001", "rollinsBN4001.jpg"],
            ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543", "burrellWarholBN1543.jpg"]]
    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        row[4] = file_handle['id']
    row_reference_set = syn.store(
        RowSet(schema=schema, rows=[Row(r) for r in data]))
    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        "select artist, album, 'year', 'catalog', cover from %s" % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results,
                                     rowId=row.rowId,
                                     versionNumber=row.versionNumber,
                                     column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)
    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch("synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method
        results = syn.tableQuery(
            "select artist, album, 'year', 'catalog', cover from %s where artist = 'John Coltrane'" % schema.id,
            resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results,
                                              rowId=row.rowId,
                                              versionNumber=row.versionNumber,
                                              column='cover')
            assert filecmp.cmp(original_files[i], file_path)
        assert not _downloadFile_mock.called, \
            "Should have used cached copy of file and not called _downloadFile"
    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    # FIX: enumerate rows (previously a stale `i` was reused) and assert the
    # comparison result (previously the cmp() return value was discarded)
    for i, row in enumerate(results):
        assert filecmp.cmp(original_files[i], file_map[row[6]])
def test_tables_csv():
    ## Define schema
    # End-to-end CSV table workflow: create, query, append, update via rowset
    # and DataFrame, empty-result handling, and row deletion. All pandas
    # sections are guarded so the test degrades gracefully without pandas.
    cols = []
    cols.append(Column(name='Name', columnType='STRING'))
    cols.append(Column(name='Born', columnType='INTEGER'))
    cols.append(Column(name='Hipness', columnType='DOUBLE'))
    cols.append(Column(name='Living', columnType='BOOLEAN'))
    schema = Schema(name='Jazz Guys', columns=cols, parent=project)
    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]
    ## the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))
    ## Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)
    ## Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row, row)
    try:
        ## check if we have pandas
        import pandas as pd
        df = results.asDataFrame()
        assert all(df.columns.values == ['Name', 'Born', 'Hipness', 'Living'])
        assert list(df.iloc[1, [0, 1, 3]]) == ['Miles Davis', 1926, False]
        assert df.iloc[1, 2] - 9.87 < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for CSV tables.\n\n'
        )
    ## Aggregate query
    # expected: Living -> [Living, min(Born), count(Living), avg(Hipness)]
    expected = {True: [True, 1929, 3, 6.38], False: [False, 1926, 5, 7.104]}
    results = syn.tableQuery(
        'select Living, min(Born), count(Living), avg(Hipness) from %s group by Living' % table.schema.id,
        resultsAs="csv",
        includeRowIdAndRowVersion=False)
    for row in results:
        living = row[0]
        assert expected[living][1] == row[1]
        assert expected[living][2] == row[2]
        assert abs(expected[living][3] - row[3]) < 0.0001
    ## Aggregate query results to DataFrame
    try:
        ## check if we have pandas
        import pandas as pd
        df = results.asDataFrame()
        assert all(expected[df.iloc[0, 0]][0:3] == df.iloc[0, 0:3])
        assert abs(expected[df.iloc[1, 0]][3] - df.iloc[1, 3]) < 0.0001
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping test of .asDataFrame for aggregate queries as CSV tables.\n\n'
        )
    ## Append rows (includes nan/inf to exercise special float handling)
    more_jazz_guys = [["Sonny Clark", 1931, 8.43, False],
                      ["Hank Mobley", 1930, 5.67, False],
                      ["Freddie Hubbard", 1938, float('nan'), False],
                      ["Thelonious Monk", 1917, float('inf'), False]]
    table = syn.store(Table(table.schema, more_jazz_guys))
    ## test that CSV file now has more jazz guys
    results = syn.tableQuery("select * from %s" % table.schema.id, resultsAs="csv")
    for expected_row, row in zip(data + more_jazz_guys, results):
        # row[2:] skips ROW_ID / ROW_VERSION; nan != nan, so compare specially
        for field, expected_field in zip(row[2:], expected_row):
            if type(field) is float and math.isnan(field):
                assert type(expected_field) is float and math.isnan(expected_field)
            elif type(expected_field) is float and math.isnan(expected_field):
                assert type(field) is float and math.isnan(field)
            else:
                assert expected_field == field
    ## Update as a RowSet
    rowset = results.asRowSet()
    for row in rowset['rows']:
        if row['values'][1] == 1930:
            row['values'][2] = 8.5
    row_reference_set = syn.store(rowset)
    ## aggregate queries won't return row id and version, so we need to
    ## handle this correctly
    results = syn.tableQuery(
        'select Born, COUNT(*) from %s group by Born order by Born' % table.schema.id,
        resultsAs="csv")
    assert results.includeRowIdAndRowVersion == False
    for i, row in enumerate(results):
        assert row[0] == [1917, 1926, 1929, 1930, 1931, 1935, 1936, 1938][i]
        assert row[1] == [1, 2, 2, 2, 2, 1, 1, 1][i]
    try:
        import pandas as pd
        results = syn.tableQuery("select * from %s where Born=1930" % table.schema.id,
                                 resultsAs="csv")
        df = results.asDataFrame()
        # NOTE(review): these two `all(...)` results are not asserted — as
        # written they verify nothing; presumably asserts were intended.
        all(df['Born'].values == 1930)
        all(df['Hipness'].values == 8.5)
        ## Update via a Data Frame
        df['Hipness'] = 9.75
        table = syn.store(Table(table.tableId, df, etag=results.etag))
        results = syn.tableQuery("select * from %s where Born=1930" % table.tableId,
                                 resultsAs="csv")
        for row in results:
            assert row[4] == 9.75
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )
    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where Born=2013' % table.tableId,
                             resultsAs="csv")
    assert len(list(results)) == 0
    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where Born=2013' % table.tableId,
                                 resultsAs="csv")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_tables_csv.\n\n'
        )
    ## delete some rows
    results = syn.tableQuery('select * from %s where Hipness < 7' % table.tableId,
                             resultsAs="csv")
    syn.delete(results)
def test_rowset_tables():
    """Integration test of the full RowSet table lifecycle.

    Uses the module-level ``syn`` client and ``project`` fixture. Creates a
    schema, stores rows, queries with ``resultsAs="rowset"``, updates and
    deletes rows, adds a column after the fact, and spot-checks pandas
    DataFrame conversion when pandas is installed.

    Fix vs. original: ``df.ix`` (removed in pandas 1.0) replaced with
    ``df.loc``.
    """
    ## One column of each commonly-used type, including LARGETEXT.
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo', columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT'),
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id,
                             resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], 'got %s but expected %s' % (
            row['values'], expected_values)

    ## To modify rows, we have to select them first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' %
                             schema1.id,
                             resultsAs="rowset")

    ## make a change
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    row_reference_set = syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id,
                             resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery(
        'select * from %s where cartoon=false order by age' % schema1.id,
        resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    row_reference_set = syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery(
        'select name from %s where birthday > %d order by birthday' %
        (schema1.id, date_2008_jan_1),
        resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        ## .loc replaces DataFrame.ix, which was removed in pandas 1.0
        assert all(df.loc[:, "name"] == ["Jane", "Henry"])
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    results = syn.tableQuery(
        'select birthday from %s where cartoon=false order by age' %
        schema1.id,
        resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(
            bday, "%Y-%m-%d"), "got %s but expected %s" % (row['values'][0],
                                                           bday)

    try:
        import pandas as pd
        results = syn.tableQuery(
            "select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
            % schema1.id,
            resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    ## test delete rows by deleting cartoon characters
    syn.delete(
        syn.tableQuery('select name from %s where cartoon = true' %
                       schema1.id,
                       resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' %
                             schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane", "Henry"] == [
        row['values'][0] for row in results
    ]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                             resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' %
                                 schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError as e1:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )
def files_to_synapse_table(in_files, synapse_project_id, table_name,
                           column_name='fileID', username='', password=''):
    """
    Upload files and file handle IDs to Synapse.

    Logs in to Synapse, uploads each file to obtain a file handle ID,
    creates a one-column FILEHANDLEID table schema under the given project,
    and stores one row per uploaded file.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    # NOTE(review): _chunkedUploadFile is a private synapseclient API that
    # was removed in newer client versions — confirm and migrate to a public
    # upload method (e.g. syn.uploadFileHandle) when upgrading the client.
    files_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        files_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # Create table schema (TODO: check whether a table with this name
    # already exists under the project before creating a duplicate schema):
    schema = syn.store(
        Schema(name=table_name,
               columns=[new_column_header],
               parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(
        RowSet(columns=[new_column_header],
               schema=schema,
               rows=[Row(r) for r in files_handles]))

    # Bug fix: the docstring documents this return value, but the original
    # implementation fell off the end and implicitly returned None.
    return synapse_project_id