def test_series_with_index(self):
    """Push a Series with an explicit integer index and compare the pulled copy."""
    stack = "test/pandas/series_with_index"
    values = np.array(['a', 'b', 'c', 'd'])
    s = pd.Series(values, index=[100, 101, 102, 103])

    push(stack, s)
    restored = pull(stack)

    # Both the index dtype and the full contents must match the pushed Series.
    self.assertEqual(s.index.dtype, restored.index.dtype)
    self.assertTrue(s.equals(restored))
def test_with_schema(self):
    """Push a Markdown object and check the stored frame's metadata and payload."""
    md = Markdown("Test *markdown*")
    ds.push("test/md", md)

    context = ds.create_context("test/md")
    frame_data = ds.pull_data(context)

    self.assertEqual("text/markdown", frame_data.content_type)
    self.assertEqual("markdown", frame_data.application)
    # The payload is compared after decoding it as UTF-8.
    payload = frame_data.data.value().decode("utf-8")
    self.assertEqual(md.text, payload)
def test_nullable_types(self):
    """Round-trip a frame with nullable ``Int64``/boolean columns and compare dtypes.

    The frame is pushed with ``DataFrameEncoder(index=False)`` and pulled back;
    every column dtype of the pulled frame must equal the pushed one.
    """
    df = pd.DataFrame({"tag1": [10, None], "tag2": [True, None]})
    df1 = df.astype({"tag1": "Int64", "tag2": pd.BooleanDtype()})
    push("test/pandas/nullable_types", df1,
         encoder=DataFrameEncoder(index=False))
    df2 = pull("test/pandas/nullable_types")

    # The previous ``map(lambda x, y: ..., zip(...))`` was never consumed
    # (map is lazy in Python 3), so no dtype was ever checked. Iterate
    # explicitly, and guard against zip() silently truncating.
    self.assertEqual(len(df2.dtypes), len(df1.dtypes))
    for pulled_dtype, pushed_dtype in zip(df2.dtypes, df1.dtypes):
        self.assertEqual(pulled_dtype, pushed_dtype)
def test_tab(self):
    """Push a figure with a ``ds.tab`` parameter and check how it is stored."""
    figure = self.get_figure()
    tab = ds.tab("My brand new tab")
    ds.push("test/my_plot", figure, my_tab=tab)

    # The tab is read from the params of the first attachment.
    first_attachment = self.get_data("test/my_plot")["attachments"][0]
    t = first_attachment["params"]["my_tab"]

    self.assertIsNotNone(t)
    self.assertEqual("tab", t["type"])
    self.assertEqual("My brand new tab", t["title"])
def prepare_server_stack(self, version: str) -> str:
    """Push a fake server jar to ``Installer._STACK`` and return the jar's file name.

    :param version: value passed to ``push`` as the ``version`` parameter.
    :return: ``name`` of the fake file created for the jar.
    """
    jar_path = self.create_fake_file("fake-server.jar")
    push_params = dict(
        profile=Installer._PROFILE,
        version=version,
        jdk_version="8",
        jdk_compatible_versions=self.java_version,
    )
    push(Installer._STACK, jar_path, **push_params)
    return jar_path.name
def test_df_with_non_int_index(self):
    """Push a frame indexed by a date range and check the index round-trips."""
    stack = "test/pandas/df_with_index_non_int"
    dates = pd.date_range('20130101', periods=6)
    random_values = np.random.randn(6, 4)
    df = pd.DataFrame(random_values, index=dates, columns=list('ABCD'))

    push(stack, df)
    df1 = pull(stack)

    # Only the index (dtype and values) is compared here, not the cell data.
    self.assertEqual(df.index.dtype, df1.index.dtype)
    pushed_index = df.index.to_series()
    pulled_index = df1.index.to_series()
    self.assertTrue(pushed_index.equals(pulled_index))
def test_push_params(self):
    """Check that ``params`` land on the attachment and ``meta`` on the frame."""
    stack = "test/my_plot"
    frame_meta = ds.FrameMeta(text="hello", x=10, y=20)
    ds.push(stack, self.get_figure(), params={"z": 30}, meta=frame_meta)

    frame = self.get_data(stack)
    attachment_params = frame["attachments"][0]["params"]

    # Attachment-level params: only "z".
    self.assertEqual(1, len(attachment_params))
    self.assertEqual(30, attachment_params["z"])

    # Frame-level params: the three values given through FrameMeta.
    self.assertEqual(3, len(frame["params"]))
    self.assertEqual({"x": 10, "y": 20, "text": "hello"}, frame["params"])
def test_with_schema(self):
    """Round-trip a mixed-dtype frame and compare column dtypes.

    The frame has float, int, datetime and string columns and is pushed with
    ``DataFrameEncoder(index=False)``. After pulling it back, each column
    dtype and the first datetime value must match the original.
    """
    df = pd.DataFrame({
        "float": [1.0],
        "int": [1],
        "datetime": [pd.Timestamp("20180310")],
        "string": ["foo"],
    })
    push("test/pandas/df_with_schema", df,
         encoder=DataFrameEncoder(index=False))
    df1 = pull("test/pandas/df_with_schema")

    # The previous ``map(lambda x, y: ..., zip(...))`` was never consumed
    # (map is lazy in Python 3), so no dtype was ever checked. Iterate
    # explicitly, and guard against zip() silently truncating.
    self.assertEqual(len(df.dtypes), len(df1.dtypes))
    for pushed_dtype, pulled_dtype in zip(df.dtypes, df1.dtypes):
        self.assertEqual(pushed_dtype, pulled_dtype)
    self.assertEqual(df["datetime"][0], df1["datetime"][0])
def test_df_with_index(self):
    """Push a frame with the default index and check the index round-trips."""
    column_order = ["first_name", "last_name", "birth_year", "school"]
    raw_data = {
        "first_name": ["John", "Donald", "Maryam", "Don", "Andrey"],
        "last_name": ["Milnor", "Knuth", "Mirzakhani", "Zagier", "Okunkov"],
        "birth_year": [1931, 1938, 1977, 1951, 1969],
        "school": ["Princeton", "Stanford", "Stanford", "MPIM", "Princeton"],
    }
    df = pd.DataFrame(raw_data, columns=column_order)

    stack = "test/pandas/df_with_index"
    push(stack, df)
    df1 = pull(stack)

    # Only the index is compared, via its Series representation.
    pushed_index = df.index.to_series()
    pulled_index = df1.index.to_series()
    self.assertTrue(pushed_index.equals(pulled_index))
def test_download_jdk(self):
    """Push a fake JDK archive, download it via the installer and inspect the result."""
    jdk_version = "8"
    fake_jdk = self.create_fake_archive("OpenJDK-1.8.0.121-x86_64-bin")

    # The fake archive must exist and must not be a directory.
    self.assertTrue(fake_jdk.exists())
    self.assertFalse(fake_jdk.is_dir())

    jdk_stack = f"{Installer._JDK_STACK_BASE}/{jdk_version}"
    push(jdk_stack, fake_jdk,
         profile=Installer._PROFILE,
         os=self.installer.get_os())

    self.installer._download_jdk(jdk_version)

    # After the download the JDK path is a directory that contains file1.txt.
    self.assertTrue(self.installer._jdk_path().exists())
    self.assertTrue(self.installer._jdk_path().is_dir())
    file_list = []
    for entry in self.installer._jdk_path().iterdir():
        file_list.append(entry.name)
    self.assertIn("file1.txt", file_list)
def test_push_pull_linear_model(self):
    """Push a fitted ``LinearRegression`` and check the pulled model matches it.

    A small random regression dataset is generated, a model is fitted on the
    training split, pushed, and pulled back. The learned coefficients, the
    intercept and the estimator hyper-parameters must be identical.
    """
    from sklearn.model_selection import train_test_split

    # Generate the regression dataset and keep only the training split;
    # the held-out part is not needed for a round-trip check.
    x, y = make_regression(n_samples=20, n_features=1, noise=0.75)
    x_train, _, y_train, _ = train_test_split(
        x, y, test_size=0.3, random_state=1234)

    std_reg = LinearRegression()
    std_reg.fit(x_train, y_train)

    push("test/sklearn/my_linear_model", std_reg, "My first linear model")
    my_model: LinearRegression = pull("test/sklearn/my_linear_model")

    # assertEqual on ndarrays only works for single-element arrays; compare
    # plain lists so the check stays valid for any number of features.
    self.assertEqual(std_reg.coef_.tolist(), my_model.coef_.tolist())
    self.assertEqual(std_reg.intercept_, my_model.intercept_)
    # Compare all constructor parameters rather than the `normalize`
    # attribute, which newer scikit-learn releases no longer define.
    self.assertEqual(std_reg.get_params(), my_model.get_params())
def test_simple_logistic_regression(self):
    """Train a one-layer Keras classifier, push it and check the pulled model's type."""
    # Imports are kept local so the test reads top to bottom as a story.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    data = load_breast_cancer()

    # Split the data into train and test sets to simulate how the model
    # would perform on unseen data.
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.33)
    _, d = x_train.shape

    # Scale the features: fit on the training set, apply to both sets.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Build the model: a single sigmoid unit on top of the d input features.
    # Alternatively, you can do:
    # model = tf.keras.models.Sequential()
    # model.add(tf.keras.layers.Dense(1, input_shape=(d,), activation="sigmoid"))
    layers = [
        tf.keras.layers.Input(shape=(d,)),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.models.Sequential(layers)
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    # Train the model.
    model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              epochs=100)

    # Evaluate the model - evaluate() returns loss and accuracy.
    print("Train score:", model.evaluate(x_train, y_train))
    print("Test score:", model.evaluate(x_test, y_test))

    push("my_tf_model", model, "My first TF model")
    model1 = pull("my_tf_model")
    is_sequential = isinstance(model1, tf.keras.models.Sequential)
    self.assertTrue(is_sequential)
def test_geo_df(self):
    """Push a GeoDataFrame, check it is stored as a zip and round-trips unchanged."""
    stack = "my_first_geo"
    capitals = {
        'City': ['Buenos Aires', 'Brasilia', 'Santiago', 'Bogota', 'Caracas'],
        'Country': ['Argentina', 'Brazil', 'Chile', 'Colombia', 'Venezuela'],
        'Latitude': [-34.58, -15.78, -33.45, 4.60, 10.48],
        'Longitude': [-58.66, -47.91, -70.66, -74.08, -66.86],
    }
    df = pd.DataFrame(capitals)

    # Point geometry is built from the longitude/latitude columns.
    points = geopandas.points_from_xy(df.Longitude, df.Latitude)
    gdf = geopandas.GeoDataFrame(df, geometry=points)

    push(stack, gdf)
    first_attachment = self.get_data(stack)["attachments"][0]
    self.assertEqual("application/zip", first_attachment["content_type"])

    gdf1 = pull(stack)
    self.assertTrue(gdf.equals(gdf1))
def test_per_frame_settings(self):
    """Check that a pushed frame records the Python version details and an ``os`` entry."""
    stack = "test/my_plot"
    ds.push(stack, self.get_figure())

    self.assertEqual(
        python_version,
        self.get_data(stack)["settings"]["python"]["version"])

    # Each version_info component is stored under a key of the same name.
    # The frame is fetched again for every check, as before.
    for component in ("major", "minor", "micro", "releaselevel", "serial"):
        self.assertEqual(
            getattr(python_version_info, component),
            self.get_data(stack)["settings"]["python"][component])

    self.assertIn("os", self.get_data(stack)["settings"])
def test_stack_access(self):
    """Check that ``access`` is stored only when it is passed to ``ds.push``."""
    # Without an explicit access level the frame has no "access" key.
    ds.push("test/my_plot", self.get_figure())
    self.assertNotIn("access", self.get_data("test/my_plot"))

    # With an explicit access level the same value is stored on the frame.
    cases = (("test/my_plot_1", "public"), ("test/my_plot_2", "private"))
    for stack, level in cases:
        ds.push(stack, self.get_figure(), access=level)
        self.assertEqual(level, self.get_data(stack)["access"])
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px


@ds.cache()
def get_data():
    """Return the Plotly Express stocks dataset (decorated with ``ds.cache``)."""
    return px.data.stocks()


def symbols_handler(self: ctrl.ComboBox):
    """Fill the combo box with every dataset column except the first one."""
    print("Calling symbols_handler")
    stocks = get_data()
    self.items = stocks.columns[1:].tolist()


def output_handler(self, ticker):
    """Draw a line chart of the selected ticker column against ``date``."""
    print("Calling output_handler")
    stocks = get_data()
    self.data = px.line(stocks, x='date', y=ticker.value())


# One combo box feeding one output; both are populated by the handlers above.
symbol_box = ctrl.ComboBox(handler=symbols_handler)
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[symbol_box], outputs=[chart])

result = ds.push("logs", app)
print(result.url)
import dstack as ds
import dstack.controls as ctrl
import pandas as pd


def app_handler(self: ctrl.Output, uploader: ctrl.FileUploader):
    """Show the first 100 rows of the first uploaded CSV, or a placeholder message."""
    if len(uploader.uploads) == 0:
        self.data = ds.md("No file selected")
        return
    # Only the first upload is read.
    with uploader.uploads[0].open() as f:
        self.data = pd.read_csv(f).head(100)


file_picker = ctrl.FileUploader(label="Select a CSV file")
preview = ctrl.Output(handler=app_handler)
app = ds.app(controls=[file_picker], outputs=[preview])

url = ds.push("controls/file_uploader", app)
print(url)
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px


@ds.cache()
def get_data():
    """Return the Plotly Express stocks dataset (decorated with ``ds.cache``)."""
    return px.data.stocks()


def output_handler(self, ticker):
    """Draw a line chart of the selected ticker column against ``date``."""
    stocks = get_data()
    self.data = px.line(stocks, x='date', y=ticker.value())


# The combo box items are computed once, when the app is defined.
ticker_box = ctrl.ComboBox(items=get_data().columns[1:].tolist())
# A static Markdown output followed by the handler-driven chart output.
intro = ctrl.Output(data=ds.md(
    "Here's a simple application with **Markdown** and a chart."))
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[ticker_box], outputs=[intro, chart])

result = ds.push("markdown", app)
print(result.url)
from datetime import datetime, timedelta
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px
import pandas_datareader as pdr


def output_handler(self: ctrl.Output, ticker: ctrl.TextField):
    """Plot the last 30 days of ``High`` values for the typed ticker.

    When the text field is empty, a Markdown placeholder is shown instead.
    """
    if len(ticker.text) == 0:
        self.data = ds.md("No ticker selected")
        return
    # 30-day window ending today.
    start = datetime.today() - timedelta(days=30)
    end = datetime.today()
    df = pdr.data.DataReader(ticker.text, 'yahoo', start, end)
    self.data = px.line(df, x=df.index, y=df['High'])


ticker_field = ctrl.TextField(label="Select ticker")
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[ticker_field], outputs=[chart])

result = ds.push('controls/text_field', app)
print(result.url)
X[col] = X[col] / X[col].max() for c in countries: X[c] = X["Country"].apply(lambda x: 1 if x == c else 0) for s in sectors: if s: X[s] = X["Sector"].apply(lambda x: 1 if x == s else 0) X = X.drop(["Country", "Sector"], axis=1) return X df = pd.read_csv("https://www.dropbox.com/s/cat8vm6lchlu5tp/data.csv?dl=1", index_col=0) countries = df["Country"].unique() sectors = df["Sector"].unique() X = df[df["RenewalMonth"] < 10].copy() y = X["Churn"] X = transform(X, countries, sectors) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99) model = LogisticRegression() model.fit(X_train, y_train) url = ds.push("sklearn_model", model) print(url)
from datetime import datetime, timedelta
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px
import pandas_datareader as pdr


def ticker_handler(self: ctrl.ComboBox):
    """Populate the combo box with a fixed list of ticker symbols."""
    self.items = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG']


def output_handler(self: ctrl.Output, ticker: ctrl.ComboBox):
    """Plot the last 30 days of ``High`` values for the selected ticker.

    When the selected index is not greater than -1, a Markdown placeholder
    is shown instead.
    """
    if ticker.selected <= -1:
        self.data = ds.md("No ticker selected")
        return
    # 30-day window ending today.
    start = datetime.today() - timedelta(days=30)
    end = datetime.today()
    symbol = ticker.items[ticker.selected]
    df = pdr.data.DataReader(symbol, 'yahoo', start, end)
    self.data = px.line(df, x=df.index, y=df['High'])


ticker_box = ctrl.ComboBox(label="Select ticker", handler=ticker_handler)
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[ticker_box], outputs=[chart])

result = ds.push('controls/combo_box', app)
print(result.url)
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px


@ds.cache()
def get_data():
    """Return the Plotly Express stocks dataset (decorated with ``ds.cache``)."""
    return px.data.stocks()


def output_handler(self, ticker):
    """Draw a line chart of the selected ticker column against ``date``."""
    stocks = get_data()
    self.data = px.line(stocks, x='date', y=ticker.value())


# Items are every dataset column except the first one.
ticker_box = ctrl.ComboBox(items=get_data().columns[1:].tolist())
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[ticker_box], outputs=[chart])

result = ds.push("stocks", app)
print(result.url)
def get_regions():
    """Return the distinct values of the ``Region`` column as a list."""
    return get_data()["Region"].unique().tolist()


def countries_handler(self: ctrl.ComboBox, regions: ctrl.ComboBox):
    """Set the items to the distinct countries of the selected region."""
    df = get_data()
    in_region = df["Region"] == regions.value()
    self.items = df[in_region]["Country"].unique().tolist()


# The function itself (not its result) is passed as ``items``.
regions = ctrl.ComboBox(items=get_regions, label="Region")
countries = ctrl.ComboBox(handler=countries_handler,
                          label="Country",
                          multiple=True,
                          depends=[regions])


def output_handler(self: ctrl.Output, countries: ctrl.ComboBox):
    """Show the rows whose ``Country`` is among the selected countries."""
    df = get_data()
    selected = df["Country"].isin(countries.value())
    self.data = df[selected]


# NOTE(review): this uses ds.Output while the annotations use ctrl.Output -
# confirm both names are exported.
table = ds.Output(handler=output_handler, depends=[countries])
app = ds.app(controls=[regions, countries], outputs=[table])

result = ds.push('combo_box', app)
print(result.url)
regions_ctrl = ctrl.ComboBox(x1["Region"].unique().tolist(), label="Region")
months_ctrl = ctrl.ComboBox(['Oct', 'Nov', 'Dec'], label="Month")
churn_ctrl = ctrl.CheckBox(label="Churn", selected=True, require_apply=False)


def app_handler(self: ctrl.Output, regions_ctrl: ctrl.ComboBox,
                months_ctrl: ctrl.ComboBox, churn_ctrl: ctrl.CheckBox):
    """Show predicted churn rows filtered by region, renewal month and churn flag.

    Predictions come from ``get_model().predict`` on the second frame returned
    by ``get_data()``. They are attached to a copy of the first frame as a
    "Yes"/"No" column before the three control filters are applied in turn.
    """
    def churn_label(prediction):
        return "Yes" if prediction == 1.0 else "No"

    def month_name(month_number):
        # Month numbers are 1-based; `months` is defined outside this block.
        return months[month_number - 1]

    x1, x1a = get_data()
    y1_pred = get_model().predict(x1a)

    data = x1.copy()
    data["Predicted Churn"] = y1_pred
    data["Predicted Churn"] = data["Predicted Churn"].apply(churn_label)
    data["RenewalMonth"] = data["RenewalMonth"].apply(month_name)

    dropped_columns = ["y2015", "y2016", "y2017", "y2018", "y2019", "Churn"]
    data = data.drop(dropped_columns, axis=1)

    # Apply the filters one after another, in the same order as before.
    wanted_churn = "Yes" if churn_ctrl.selected else "No"
    data = data[data["Predicted Churn"] == wanted_churn]
    data = data[data["Region"] == regions_ctrl.value()]
    data = data[data["RenewalMonth"] == months_ctrl.value()]
    self.data = data


app = ds.app(controls=[regions_ctrl, months_ctrl, churn_ctrl],
             outputs=[ctrl.Output(handler=app_handler)])

url = ds.push("sklearn", app)
print(url)
def test_linear_regression_weights(self):
    """Push only the weights of a trained PyTorch model and load them into a fresh one.

    A one-feature linear model is trained on ``y = 2x + 1`` and pushed with
    ``TorchModelEncoder.STORE_WHOLE_MODEL`` disabled. It is then pulled through
    ``TorchModelWeightsDecoder`` into a newly constructed model. The decoder
    must return that same model object, holding the trained weights.
    """
    from dstack.torch.handlers import TorchModelEncoder
    from dstack.torch.handlers import TorchModelWeightsDecoder

    # Dummy training data: x = 0..10 as a column vector, y = 2x + 1.
    x_values = list(range(11))
    x_train = np.array(x_values, dtype=np.float32).reshape(-1, 1)
    y_values = [2 * i + 1 for i in x_values]
    y_train = np.array(y_values, dtype=np.float32).reshape(-1, 1)

    class LinearRegression(torch.nn.Module):
        def __init__(self, input_size, output_size):
            super(LinearRegression, self).__init__()
            self.linear = torch.nn.Linear(input_size, output_size)

        def forward(self, x):
            return self.linear(x)

    input_dim = 1   # takes variable 'x'
    output_dim = 1  # takes variable 'y'
    learning_rate = 0.01
    epochs = 100

    model = LinearRegression(input_dim, output_dim)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # The model is never moved off the CPU, so the data has to stay there
    # too. The previous CUDA branch moved only the inputs to the GPU, which
    # is a device mismatch on any machine where CUDA is available.
    inputs = torch.from_numpy(x_train)
    labels = torch.from_numpy(y_train)

    for epoch in range(epochs):
        # Clear gradients so nothing accumulates from the previous epoch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        print(loss)
        # Compute gradients w.r.t. the parameters, then update them.
        loss.backward()
        optimizer.step()
        print('epoch {}, loss {}'.format(epoch, loss.item()))

    # The flag is class-level state; restore it afterwards so it does not
    # leak into other tests.
    previous_flag = TorchModelEncoder.STORE_WHOLE_MODEL
    TorchModelEncoder.STORE_WHOLE_MODEL = False
    try:
        push("my_torch_model", model, "My first PyTorch model")
        model1 = LinearRegression(input_dim, output_dim)
        my_model: LinearRegression = pull(
            "my_torch_model", decoder=TorchModelWeightsDecoder(model1))

        # Module equality is identity, so state that intent explicitly.
        self.assertIs(model1, my_model)

        # Compare tensors one by one: ``==`` on a tensor is element-wise and
        # only has a truth value for single-element tensors.
        expected_state = model.state_dict()
        actual_state = my_model.state_dict()
        self.assertEqual(list(expected_state.keys()),
                         list(actual_state.keys()))
        for key, expected_tensor in expected_state.items():
            self.assertTrue(
                torch.equal(expected_tensor, actual_state[key]), key)
    finally:
        TorchModelEncoder.STORE_WHOLE_MODEL = previous_flag
import dstack as ds
import dstack.controls as ctrl
from handlers import fake_handler

# The app declares its local module dependencies and a requirements file.
fake_output = ctrl.Output(handler=fake_handler)
app = ds.app(
    outputs=[fake_output],
    depends=["handlers", "utils"],
    requirements="requirements.txt",
)

# An equivalent alternative is the following:
# ds.app(outputs=[ctrl.Output(handler=fake_handler)], depends=["numpy", "pandas", "faker==5.5.0", "handlers", "utils"])

url = ds.push("depends", app)
print(url)
def test_series(self):
    """Push a Series with ``SeriesEncoder(index=False)`` and compare the pulled copy."""
    stack = "test/pandas/series"
    values = np.array(['a', 'b', 'c', 'd'])
    s = pd.Series(values)

    push(stack, s, encoder=SeriesEncoder(index=False))
    restored = pull(stack)

    self.assertTrue(s.equals(restored))
import dstack.controls as ctrl
import dstack as ds
import plotly.express as px


@ds.cache()
def get_data():
    """Return the Plotly Express gapminder dataset (decorated with ``ds.cache``)."""
    return px.data.gapminder()


def output_handler(self: ctrl.Output, year: ctrl.Slider):
    """Draw a GDP-vs-life-expectancy scatter plot for the selected year."""
    chosen_year = year.values[year.selected]
    rows = get_data().query(f"year=={chosen_year!s}")
    self.data = px.scatter(
        rows,
        x="gdpPercap",
        y="lifeExp",
        size="pop",
        color="country",
        hover_name="country",
        log_x=True,
        size_max=60,
    )


# Slider values are the distinct years in the dataset.
year_slider = ctrl.Slider(values=get_data()["year"].unique().tolist(),
                          require_apply=False)
chart = ctrl.Output(handler=output_handler)
app = ds.app(controls=[year_slider], outputs=[chart])

result = ds.push('controls/slider', app)
print(result.url)
def test_per_frame_settings(self):
    """Check that a pushed frame records the Python version and an ``os`` entry."""
    stack = "test/my_plot"
    ds.push(stack, self.get_figure())

    # The frame is fetched separately for each check, as before.
    python_settings = self.get_data(stack)["settings"]["python"]
    self.assertEqual(python_version, python_settings)

    frame_settings = self.get_data(stack)["settings"]
    self.assertIn("os", frame_settings)