def test_series_with_index(self):
    """Round-trip a Series with an explicit integer index through push/pull.

    Asserts that the pulled Series has the same index dtype and compares
    equal to the pushed one.
    """
    stack = "test/pandas/series_with_index"
    values = np.array(['a', 'b', 'c', 'd'])
    original = pd.Series(values, index=[100, 101, 102, 103])

    push(stack, original)
    restored = pull(stack)

    self.assertEqual(original.index.dtype, restored.index.dtype)
    self.assertTrue(original.equals(restored))
def test_nullable_types(self):
    """Round-trip a DataFrame with nullable integer/boolean columns.

    The frame is pushed without its index and the column dtypes of the
    pulled frame are compared, column by column, with the pushed ones.
    """
    df = pd.DataFrame({"tag1": [10, None], "tag2": [True, None]})
    df1 = df.astype({"tag1": "Int64", "tag2": pd.BooleanDtype()})
    push("test/pandas/nullable_types", df1,
         encoder=DataFrameEncoder(index=False))
    df2 = pull("test/pandas/nullable_types")

    # The previous `map(lambda x, y: ..., zip(...))` was never consumed, so
    # no assertion ever ran (and the two-argument lambda could not accept
    # the tuples produced by zip anyway). Compare explicitly instead.
    # The length check guards against zip silently truncating.
    self.assertEqual(len(df1.dtypes), len(df2.dtypes))
    for pulled_dtype, pushed_dtype in zip(df2.dtypes, df1.dtypes):
        self.assertEqual(pulled_dtype, pushed_dtype)
def test_df_with_non_int_index(self):
    """Round-trip a DataFrame indexed by dates through push/pull.

    Asserts that the index dtype and the index values are unchanged.
    """
    stack = "test/pandas/df_with_index_non_int"
    date_index = pd.date_range('20130101', periods=6)
    original = pd.DataFrame(
        np.random.randn(6, 4),
        index=date_index,
        columns=list('ABCD'),
    )

    push(stack, original)
    restored = pull(stack)

    self.assertEqual(original.index.dtype, restored.index.dtype)
    pushed_index = original.index.to_series()
    pulled_index = restored.index.to_series()
    self.assertTrue(pushed_index.equals(pulled_index))
def test_with_schema(self):
    """Round-trip a one-row DataFrame with mixed column types.

    The frame (float, int, datetime and string columns) is pushed without
    its index; the pulled frame must have the same column dtypes and the
    same datetime value.
    """
    df = pd.DataFrame({
        "float": [1.0],
        "int": [1],
        "datetime": [pd.Timestamp("20180310")],
        "string": ["foo"],
    })
    push("test/pandas/df_with_schema", df,
         encoder=DataFrameEncoder(index=False))
    df1 = pull("test/pandas/df_with_schema")

    # The previous `map(lambda x, y: ..., zip(...))` was never consumed, so
    # the dtype comparison silently did nothing (and the two-argument
    # lambda could not accept the tuples produced by zip anyway).
    # The length check guards against zip silently truncating.
    self.assertEqual(len(df.dtypes), len(df1.dtypes))
    for pushed_dtype, pulled_dtype in zip(df.dtypes, df1.dtypes):
        self.assertEqual(pushed_dtype, pulled_dtype)

    self.assertEqual(df["datetime"][0], df1["datetime"][0])
def test_df_with_index(self):
    """Round-trip a DataFrame with its default index through push/pull.

    Asserts that the index of the pulled frame equals the pushed one.
    """
    stack = "test/pandas/df_with_index"
    people = {
        "first_name": ["John", "Donald", "Maryam", "Don", "Andrey"],
        "last_name": ["Milnor", "Knuth", "Mirzakhani", "Zagier", "Okunkov"],
        "birth_year": [1931, 1938, 1977, 1951, 1969],
        "school": ["Princeton", "Stanford", "Stanford", "MPIM", "Princeton"],
    }
    column_order = ["first_name", "last_name", "birth_year", "school"]
    original = pd.DataFrame(people, columns=column_order)

    push(stack, original)
    restored = pull(stack)

    pushed_index = original.index.to_series()
    pulled_index = restored.index.to_series()
    self.assertTrue(pushed_index.equals(pulled_index))
def test_push_pull_linear_model(self):
    """Fit a scikit-learn linear regression and round-trip it via push/pull.

    Asserts that ``coef_``, ``intercept_`` and ``normalize`` of the pulled
    model equal those of the fitted one.
    """
    from sklearn.model_selection import train_test_split

    stack = "test/sklearn/my_linear_model"

    # Small single-feature regression dataset.
    features, targets = make_regression(n_samples=20, n_features=1, noise=0.75)

    # Fixed seed for the split; only the training part is used below.
    split = train_test_split(features, targets, test_size=0.3,
                             random_state=1234)
    x_train, _x_test, y_train, _y_test = split

    # Train the simple linear regression.
    fitted = LinearRegression()
    fitted.fit(x_train, y_train)

    push(stack, fitted, "My first linear model")
    restored: LinearRegression = pull(stack)

    for attribute in ("coef_", "intercept_", "normalize"):
        self.assertEqual(getattr(fitted, attribute),
                         getattr(restored, attribute))
def test_simple_logistic_regression(self):
    """Train a one-layer Keras classifier and round-trip it via push/pull.

    The only assertion is that the pulled object is a
    ``tf.keras.models.Sequential`` instance.
    """
    # Imports are kept local to this test, as in the original.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    dataset = load_breast_cancer()

    # Split into train and test sets (no fixed seed).
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.33)
    _, n_features = x_train.shape

    # Scale the features: fit on the training set, reuse on the test set.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Build the model: a single sigmoid unit on top of the input layer.
    layers = [
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.models.Sequential(layers)
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    # Train the model; the returned history object is not needed.
    model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              epochs=100)

    # evaluate() returns loss and accuracy.
    print("Train score:", model.evaluate(x_train, y_train))
    print("Test score:", model.evaluate(x_test, y_test))

    push("my_tf_model", model, "My first TF model")
    restored = pull("my_tf_model")
    self.assertTrue(isinstance(restored, tf.keras.models.Sequential))
def test_geo_df(self):
    """Round-trip a GeoDataFrame of city points through push/pull.

    Asserts that the first attachment of the pushed stack has content type
    ``application/zip`` and that the pulled frame equals the pushed one.
    """
    stack = "my_first_geo"
    cities = pd.DataFrame({
        'City': ['Buenos Aires', 'Brasilia', 'Santiago', 'Bogota', 'Caracas'],
        'Country': ['Argentina', 'Brazil', 'Chile', 'Colombia', 'Venezuela'],
        'Latitude': [-34.58, -15.78, -33.45, 4.60, 10.48],
        'Longitude': [-58.66, -47.91, -70.66, -74.08, -66.86],
    })
    points = geopandas.points_from_xy(cities.Longitude, cities.Latitude)
    gdf = geopandas.GeoDataFrame(cities, geometry=points)

    push(stack, gdf)

    first_attachment = self.get_data(stack)["attachments"][0]
    self.assertEqual("application/zip", first_attachment["content_type"])

    restored = pull(stack)
    self.assertTrue(gdf.equals(restored))
def test_linear_regression_weights(self):
    """Train a tiny PyTorch linear model and round-trip its weights.

    The model is pushed with ``TorchModelEncoder.STORE_WHOLE_MODEL`` set to
    ``False`` and pulled back into a freshly constructed model through
    ``TorchModelWeightsDecoder``. The test asserts that the decoder returns
    that fresh model and that its state dict matches the trained one.
    """
    # Dummy training data for y = 2x + 1, as column vectors of float32.
    x_values = list(range(11))
    x_train = np.array(x_values, dtype=np.float32).reshape(-1, 1)
    y_values = [2 * i + 1 for i in x_values]
    y_train = np.array(y_values, dtype=np.float32).reshape(-1, 1)

    class LinearRegression(torch.nn.Module):
        def __init__(self, input_size, output_size):
            super(LinearRegression, self).__init__()
            self.linear = torch.nn.Linear(input_size, output_size)

        def forward(self, x):
            return self.linear(x)

    input_dim = 1  # takes variable 'x'
    output_dim = 1  # takes variable 'y'
    learning_rate = 0.01
    epochs = 100

    model = LinearRegression(input_dim, output_dim)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # The model is never moved to a GPU, so the tensors must stay on the CPU
    # as well: the previous `.cuda()` branch moved only the inputs and would
    # fail with a device mismatch on CUDA hosts. The conversion does not
    # depend on the epoch, so it is done once outside the loop.
    inputs = Variable(torch.from_numpy(x_train))
    labels = Variable(torch.from_numpy(y_train))

    for epoch in range(epochs):
        # Clear gradient buffers so gradients from the previous epoch are
        # not accumulated.
        optimizer.zero_grad()

        # Forward pass and loss for the predicted output.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        print(loss)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        print('epoch {}, loss {}'.format(epoch, loss.item()))

    from dstack.torch.handlers import TorchModelEncoder
    # NOTE(review): this sets a class-level flag that is never restored;
    # confirm that other tests do not depend on its previous value.
    TorchModelEncoder.STORE_WHOLE_MODEL = False
    push("my_torch_model", model, "My first PyTorch model")

    model1 = LinearRegression(input_dim, output_dim)
    from dstack.torch.handlers import TorchModelWeightsDecoder
    my_model: LinearRegression = pull(
        "my_torch_model", decoder=TorchModelWeightsDecoder(model1))
    self.assertEqual(model1, my_model)

    # Comparing state dicts with `==` relies on the truth value of each
    # tensor comparison, which only works for single-element tensors.
    # Compare the keys, then each tensor explicitly.
    expected_state = model.state_dict()
    actual_state = my_model.state_dict()
    self.assertEqual(list(expected_state.keys()), list(actual_state.keys()))
    for key, expected_tensor in expected_state.items():
        self.assertTrue(torch.equal(expected_tensor, actual_state[key]))
def test_series(self):
    """Round-trip a Series pushed without its index through push/pull.

    Asserts that the pulled Series equals the pushed one.
    """
    stack = "test/pandas/series"
    original = pd.Series(np.array(['a', 'b', 'c', 'd']))

    push(stack, original, encoder=SeriesEncoder(index=False))
    restored = pull(stack)

    self.assertTrue(original.equals(restored))
def get_model():
    """Return whatever ``ds.pull`` yields for the "sklearn_model" stack."""
    model = ds.pull("sklearn_model")
    return model