def dontruntest_big_csvs(syn, project, schedule_for_cleanup):
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='n', columnType='INTEGER'),
        Column(name='is_bogus', columnType='BOOLEAN')
    ]
    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    # write rows to CSV file
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])
        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                                 random.randint(0, 100), random.random() >= 0.5))

    # upload CSV
    syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)

def test_insert_dataframe_column_if_not_exist__nonexistent_column():
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data was inserted
    assert_equals(data, df[column_name].tolist())

def test_insert_dataframe_column_if_not_exist__nonexistent_column():
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data was inserted
    assert data == df[column_name].tolist()

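# The _insert_dataframe_column_if_not_exist tests above and below rely on a shared fixture,
# _insert_dataframe_column_if_not_exist__setup(), that is not included in this excerpt.
# A minimal sketch of what such a helper could look like, assuming it only needs to hand back
# an empty DataFrame, a column name, and the column data; the real helper's names and values
# may differ:
def _insert_dataframe_column_if_not_exist__setup():
    import pandas as pd
    df = pd.DataFrame()                 # empty frame, so any column can be inserted
    column_name = "panda"               # hypothetical column name
    data = ["pandas", "are", "alive"]   # hypothetical column values
    return df, column_name, data
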
def test_iter_with_no_headers_in_csv(self):
    # csv file does not have headers
    string_io = StringIOContextManager("1,2,etag1,\"I like trains\"\n"
                                       "5,1,etag2,\"weeeeeeeeeeee\"\n")
    with patch.object(io, "open", return_value=string_io):
        table = CsvFileTable("syn123", "/fake/file/path", header=False)
        iter = table.__iter__()
        assert_raises(ValueError, next, iter)

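# StringIOContextManager is a small utility from the surrounding test suite that is not
# defined in this excerpt. A minimal sketch, assuming it only needs to behave like the text
# file handle a patched io.open would return and be usable in a "with" block; the real
# helper may differ, e.g. in how it handles closing:
import io

class StringIOContextManager(io.StringIO):
    """An in-memory text buffer usable wherever the code under test expects io.open(...)'s handle."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # leave the buffer open so tests can re-read it after the "with" block exits
        return False
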
def test_iter_with_no_headers(self):
    # self.headers is None
    string_io = StringIOContextManager("ROW_ID,ROW_VERSION,ROW_ETAG,col\n"
                                       "1,2,etag1,\"I like trains\"\n"
                                       "5,1,etag2,\"weeeeeeeeeeee\"\n")
    with patch.object(io, "open", return_value=string_io):
        table = CsvFileTable("syn123", "/fake/file/path")
        iter = table.__iter__()
        assert_raises(ValueError, next, iter)

def test_iter_metadata__has_etag(self):
    string_io = StringIOContextManager("ROW_ID,ROW_VERSION,ROW_ETAG,asdf\n"
                                       "1,2,etag1,\"I like trains\"\n"
                                       "5,1,etag2,\"weeeeeeeeeeee\"\n")
    with patch.object(io, "open", return_value=string_io):
        csv_file_table = CsvFileTable("syn123", "/fake/file/path")
        metadata = [x for x in csv_file_table.iter_row_metadata()]
        assert 2 == len(metadata)
        assert (1, 2, "etag1") == metadata[0]
        assert (5, 1, "etag2") == metadata[1]

def test_iter_metadata__no_etag(self):
    string_io = StringIOContextManager("ROW_ID,ROW_VERSION,asdf\n"
                                       "1,2,\"I like trains\"\n"
                                       "5,1,\"weeeeeeeeeeee\"\n")
    with patch.object(io, "open", return_value=string_io):
        csv_file_table = CsvFileTable("syn123", "/fake/file/path")
        metadata = [x for x in csv_file_table.iter_row_metadata()]
        assert_equals(2, len(metadata))
        assert_equals((1, 2, None), metadata[0])
        assert_equals((5, 1, None), metadata[1])

def test_insert_dataframe_column_if_not_exist__existing_column_not_matching():
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # add different data to the DataFrame prior to calling our method
    df.insert(0, column_name, ['mercy', 'main', 'btw'])

    # make sure the data is different
    assert_not_equals(data, df[column_name].tolist())

    # method under test should raise exception
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

def test_insert_dataframe_column_if_not_exist__existing_column_matching():
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # add the same data to the DataFrame prior to calling our method
    df.insert(0, column_name, data)

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data has not changed
    assert_equals(data, df[column_name].tolist())

def test_insert_dataframe_column_if_not_exist__existing_column_matching():
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # add the same data to the DataFrame prior to calling our method
    df.insert(0, column_name, data)

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data has not changed
    assert data == df[column_name].tolist()

def test_iter_row_metadata_mismatch_in_headers(self):
    # csv file does not contain row metadata, self.headers does
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]

    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        iter = table.__iter__()
        assert_raises(ValueError, next, iter)

def test_iter_with_mismatch_row_metadata(self):
    # self.headers and the csv file headers contain mismatched row metadata
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]

    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        iter = table.__iter__()
        assert_raises(ValueError, next, iter)

def test_insert_dataframe_column_if_not_exist__nonexistent_column():
    if not pandas_found:
        raise SkipTest(
            "pandas could not be found. please let the pandas into your library."
        )
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data was inserted
    assert_equals(data, df[column_name].tolist())

def test_insert_dataframe_column_if_not_exist__existing_column_not_matching():
    if not pandas_found:
        raise SkipTest(
            "pandas could not be found. please let the pandas into your library."
        )
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # add different data to the DataFrame prior to calling our method
    df.insert(0, column_name, ['mercy', 'main', 'btw'])

    # make sure the data is different
    assert_not_equals(data, df[column_name].tolist())

    # method under test should raise exception
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

def dontruntest_big_csvs():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)

    # write rows to CSV file; open the temp file in text mode so csv.writer can write str rows
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp:
        schedule_for_cleanup(temp.name)
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=os.linesep)
        writer.writerow([col.name for col in cols])
        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                                 random.randint(0, 100), random.random() >= 0.5))
            print("wrote 100 rows to disk")

    # upload CSV
    UploadToTableResult = syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)

def test_insert_dataframe_column_if_not_exist__existing_column_matching():
    if not pandas_found:
        raise SkipTest(
            "pandas could not be found. please let the pandas into your library."
        )
    df, column_name, data = _insert_dataframe_column_if_not_exist__setup()

    # add the same data to the DataFrame prior to calling our method
    df.insert(0, column_name, data)

    # method under test
    CsvFileTable._insert_dataframe_column_if_not_exist(df, 0, column_name, data)

    # make sure the data has not changed
    assert_equals(data, df[column_name].tolist())

def test_iter_with_table_row_metadata(self):
    # csv file has row metadata, self.headers does not
    data = "ROW_ID,ROW_VERSION,col\n" \
           "1,2,\"I like trains\"\n" \
           "5,1,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn.from_column(col) for col in cols]

    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected_rows = [["I like trains"], ["weeeeeeeeeeee"]]
        for expected_row, table_row in zip(expected_rows, table):
            assert expected_row == table_row

def test_iter_no_row_metadata(self):
    # neither the csv headers nor self.headers contain row metadata
    data = "col1,col2\n" \
           "1,2\n" \
           "2,1\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn.from_column(col) for col in cols]

    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected_rows = [[1, 2], [2, 1]]
        for expected_row, table_row in zip(expected_rows, table):
            assert expected_row == table_row

def test_iter_with_file_view_row_metadata(self):
    # csv file and self.headers contain matching row metadata
    data = "ROW_ID,ROW_VERSION,ROW_ETAG,col\n" \
           "1,2,etag1,\"I like trains\"\n" \
           "5,1,etag2,\"weeeeeeeeeeee\"\n"
    cols = as_table_columns(StringIOContextManager(data))
    headers = [SelectColumn(name="ROW_ID", columnType="STRING"),
               SelectColumn(name="ROW_VERSION", columnType="STRING"),
               SelectColumn(name="ROW_ETAG", columnType="STRING")] + \
              [SelectColumn.from_column(col) for col in cols]

    with patch.object(io, "open", return_value=StringIOContextManager(data)):
        table = CsvFileTable("syn123", "/fake/file/path", headers=headers)
        expected_rows = [['1', '2', "etag1", "I like trains"],
                         ['5', '1', "etag2", "weeeeeeeeeeee"]]
        for expected_row, table_row in zip(expected_rows, table):
            assert expected_row == table_row

def dontruntest_big_csvs():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    schema1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))
    print("Created table:", schema1.id)
    print("with columns:", schema1.columnIds)

    # write rows to CSV file
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        schedule_for_cleanup(temp.name)
        filename = temp.name

    with io.open(filename, mode='w', encoding="utf-8", newline='') as temp:
        writer = csv.writer(temp, quoting=csv.QUOTE_NONNUMERIC, lineterminator=str(os.linesep))
        writer.writerow([col.name for col in cols])
        for i in range(10):
            for j in range(100):
                foo = cols[1].enumValues[random.randint(0, 2)]
                writer.writerow(('Robot ' + str(i * 100 + j), foo, random.random() * 200.0,
                                 random.randint(0, 100), random.random() >= 0.5))
            print("wrote 100 rows to disk")

    # upload CSV
    UploadToTableResult = syn._uploadCsv(filepath=temp.name, schema=schema1)

    from synapseclient.table import CsvFileTable
    results = CsvFileTable.from_table_query(syn, "select * from %s" % schema1.id)
    print("etag:", results.etag)
    print("tableId:", results.tableId)

    for row in results:
        print(row)