def get(self, name, dataset):
    """Evaluate a previously stored model against a dataset.

    Loads the model and its metadata by *name*, optionally re-applies the
    column configuration recorded at training time, and returns the
    accuracy score, a predictions plot and the confusion matrix.
    """
    model = load_model(name)
    metadata = load_model_metadata(name)
    df = get_dataset(dataset)  # TODO: make it optional
    # Re-apply the stored column configuration when one was recorded.
    # .get() avoids a KeyError on metadata that lacks the key, matching
    # the optional-configuration handling used by the other endpoints.
    configuration_name = metadata.get('configuration')
    if configuration_name:
        configuration = load_configuration(metadata['dataset'],
                                           configuration_name)
        modify(df, configuration)

    label = metadata['label']
    data_x = df.drop(label, axis=1)
    data_y = df[label]

    predicts = model.predict_classes(data_x)
    # Accuracy: fraction of rows whose predicted class matches the label.
    score = float((data_y == predicts).sum() / predicts.size)
    conf_matrix = confusion_matrix(data_y, predicts)

    plots = OrderedDict()
    plot_classification_predictions(data_y, predicts,
                                    orientation='vertical', stacked=False)
    plots['predictions'] = plot_to_base64()

    return {
        'score': score,
        'plots': plots,
        'confusionMatrix': conf_matrix.tolist()
    }
def get(self, training, testing):
    """Check whether two datasets are structurally compatible.

    Compares (using the first 5 rows of each) the column count, the
    column names and the column dtypes, and returns a dict of three
    booleans: ``length``, ``columns`` and ``types``.
    """
    train = get_dataset(training, 5)
    test = get_dataset(testing, 5)
    result = {
        'length': len(train.columns) == len(test.columns),
        'columns': False,
        'types': False,
    }
    # Elementwise comparison of names/dtypes is only valid (and only
    # meaningful) when both frames have the same number of columns.
    if result['length']:
        result['columns'] = bool((train.columns == test.columns).all())
        # Series.asobject was removed in pandas 1.0; astype(object) is
        # the supported equivalent for elementwise dtype comparison.
        result['types'] = bool(
            (train.dtypes.astype(object) == test.dtypes.astype(object)).all())
    return result
def post(self):
    """Train a classification model on a dataset.

    Reads the dataset name, label column, layer sizes, epoch count and an
    optional configuration / validation split from the request payload,
    trains a fresh model and returns its training accuracy, history plots
    and confusion matrix.
    """
    dataset = api.payload['dataset']
    df = get_dataset(dataset)
    # Optionally apply a saved column configuration before training.
    if 'configuration' in api.payload:
        configuration_name = api.payload['configuration']
        configuration = load_configuration(dataset, configuration_name)
        modify(df, configuration)

    df = shuffle(df)
    label = api.payload['labelColumn']
    data_x = df.drop(label, axis=1)
    data_y = df[label]
    input_dim = data_x.columns.size
    output_dim = data_y.unique().size

    epochs = api.payload['epochs']
    layers = api.payload['layers']
    # Reject non-positive layer sizes up front.
    if any(layer <= 0 for layer in layers):
        return "Invalid layer", 400
    validation_split = api.payload.get('validationSplit', 0)

    # Drop any state left over from a previous request before building.
    keras.backend.clear_session()
    model = ClassificationModel(input_dim, output_dim, layers)
    history = model.fit(x=data_x, y=data_y, epochs=epochs, verbose=2,
                        validation_split=validation_split)

    predicts = model.predict_classes(data_x)
    # Training-set accuracy: fraction of correctly predicted rows.
    score = float((data_y == predicts).sum() / predicts.size)
    conf_matrix = confusion_matrix(data_y, predicts)

    plots = OrderedDict()
    plot_history_accuracy(history)
    plots['accuracy'] = plot_to_base64()
    plot_history_loss(history)
    plots['loss'] = plot_to_base64()
    plot_classification_predictions(data_y, predicts,
                                    orientation='vertical', stacked=False)
    plots['predictions'] = plot_to_base64()

    return {
        'score': score,
        'plots': plots,
        'confusionMatrix': conf_matrix.tolist()
    }
def post(self):
    """Cross-validate a classification model with stratified k-fold.

    Trains a fresh model per fold (default 10 folds, overridable via the
    ``kfolds`` payload key) and returns the per-fold train/test accuracy
    scores plus two cross-validation plots.
    """
    dataset = api.payload['dataset']
    df = get_dataset(dataset)
    # Optionally apply a saved column configuration before training.
    if 'configuration' in api.payload:
        configuration_name = api.payload['configuration']
        configuration = load_configuration(dataset, configuration_name)
        modify(df, configuration)

    label = api.payload['labelColumn']
    data_x = df.drop(label, axis=1)
    data_y = df[label]
    input_dim = data_x.columns.size
    output_dim = data_y.unique().size

    epochs = api.payload['epochs']
    layers = api.payload['layers']
    # Reject non-positive layer sizes up front.
    if any(layer <= 0 for layer in layers):
        return "Invalid layer", 400

    def accuracy(model, x, y):
        # Fraction of rows whose predicted class matches the label.
        predicts = model.predict_classes(x)
        return float((y == predicts).sum() / predicts.size)

    train_scores = []
    test_scores = []
    kfolds = api.payload.get('kfolds', 10)
    kf = StratifiedKFold(n_splits=kfolds, shuffle=True)
    for train_indices, test_indices in kf.split(data_x, data_y):
        train_x = data_x.iloc[train_indices]
        train_y = data_y.iloc[train_indices]
        test_x = data_x.iloc[test_indices]
        test_y = data_y.iloc[test_indices]

        # Fresh model per fold; clear_session drops the previous graph.
        keras.backend.clear_session()
        model = ClassificationModel(input_dim, output_dim, layers)
        model.fit(x=train_x, y=train_y.values, epochs=epochs, verbose=2)

        train_scores.append(accuracy(model, train_x, train_y))
        test_scores.append(accuracy(model, test_x, test_y))

    plots = OrderedDict()
    plot_cross_validation(train_scores, test_scores, plot_type='bar')
    plots['crossValidation'] = plot_to_base64()
    plot_cross_validation(train_scores, test_scores, plot_type='plot')
    plots['crossValidation2'] = plot_to_base64()

    return {
        'trainScores': train_scores,
        'testScores': test_scores,
        'plots': plots,
    }
def get(self, dataset, rows):
    """Return a preview of a dataset: column names, dtypes and row values."""
    df = get_dataset(dataset, rows, True)
    columns = df.columns.values.tolist()
    # dtype objects are not JSON serializable; expose their names instead.
    column_types = [dtype.name for dtype in df.dtypes.values]
    data_preview = df.values.tolist()
    return {
        'columns': columns,
        'columnTypes': column_types,
        'rows': data_preview
    }
def get(self, dataset, configuration=None):
    """Describe every column of a dataset.

    Returns, for each column, its dtype, descriptive statistics and
    base64-encoded plots (histogram always; box/violin for numeric
    columns). Results are cached per (dataset, configuration) pair and a
    cached result is returned as-is when present.
    """
    cached = try_load(dataset, configuration)
    if cached:
        return cached

    df = get_dataset(dataset)
    if configuration:
        modify(df, load_configuration(dataset, configuration))

    columns = []
    for col in df:
        series = df[col]
        summary = series.describe()
        numeric = np.issubdtype(series.dtype.type, np.number)

        if numeric:
            stats = {
                "count": int(summary["count"]),
                "mean": summary["mean"],
                "std": summary["std"],
                "min": summary["min"],
                "p25": summary['25%'],
                "p50": summary["50%"],
                "p75": summary["75%"],
                "max": summary["max"]
            }
        else:
            # describe() on non-numeric data reports counts only.
            stats = {
                "count": int(summary["count"]),
                "unique": int(summary["unique"])
            }

        plots = {}
        plot_histogram(series, col, numeric)
        plots['histogram'] = plot_to_base64()
        if numeric:
            plot_box_and_violin(series)
            plots['boxplot'] = plot_to_base64()

        columns.append({
            "name": col,
            "type": series.dtype.name,
            "numeric": numeric,
            "descriptiveStatistics": stats,
            "plots": plots
        })

    result = {"columns": columns}
    store(dataset, configuration, result)
    return result
def get(self, dataset):
    """List a dataset's columns with dtype name and a missing-value flag."""
    df = get_dataset(dataset)
    columns = [
        {
            'name': name,
            'type': df[name].dtype.name,
            # bool() keeps the value JSON serializable (numpy.bool_ is not).
            'hasNA': bool(df[name].isnull().any()),
        }
        for name in df
    ]
    return {'columns': columns}
def post(self):
    """Train a small dense classifier on one dataset, evaluate it on
    another, and save the trained model to disk.

    Payload keys: ``name`` (experiment/model name), ``trainingDataset``,
    ``testingDataset`` and ``labelColumnName``. Returns the test loss and
    test accuracy.
    """
    experiment_name = api.payload['name']
    train_dataset_name = api.payload['trainingDataset']
    test_dataset_name = api.payload['testingDataset']
    label_column_name = api.payload['labelColumnName']

    train = get_dataset(train_dataset_name)
    test = get_dataset(test_dataset_name)

    train_x, train_y_raw = self._split_xy(train, label_column_name)
    test_x, test_y_raw = self._split_xy(test, label_column_name)

    # Encode labels as integers. The test labels MUST be mapped through
    # the train class index: factorizing the test set independently
    # mis-numbers the classes whenever the test set is missing (or
    # ordered differently from) a train class, silently corrupting the
    # reported accuracy. get_indexer returns -1 for labels unseen during
    # training, which the loss will then (correctly) penalize.
    train_y, classes = train_y_raw.factorize(sort=True)
    test_y = classes.get_indexer(test_y_raw)

    # prepare model: one hidden layer, softmax over the train classes
    model = keras.Sequential([
        keras.layers.Dense(16, activation=tf.nn.relu,
                           input_shape=train_x.shape[1:]),
        keras.layers.Dense(len(classes), activation=tf.nn.softmax)
    ])
    model.summary()

    # compile model
    model.compile(optimizer=tf.train.AdamOptimizer(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # train, holding out 10% of the training rows for validation
    model.fit(train_x, train_y, epochs=15, validation_split=0.1)

    # test
    test_loss, test_acc = model.evaluate(test_x, test_y)

    keras.models.save_model(model,
                            'data/models/{}.h5py'.format(experiment_name))

    return {'testLoss': test_loss, 'testAccuracy': test_acc}
def put(self, dataset):
    """Split a dataset into train/test parts and save each under a new name."""
    payload = api.payload
    ratio = payload['ratio']
    # Renamed locally so it does not shadow the module-level shuffle().
    do_shuffle = payload['shuffle']
    train_name = payload['trainDataset']
    test_name = payload['testDataset']

    df = get_dataset(dataset)
    train, test = train_test_split(df, ratio, do_shuffle)
    save_dataset(train, train_name)
    save_dataset(test, test_name)
    return None, 200
def put(self, dataset):
    """Apply column modifications to a dataset.

    Optionally persists the modified data under a new dataset name and/or
    saves the column configuration itself for later reuse; the two are
    independent of each other.
    """
    df = get_dataset(dataset)
    columns = api.payload['columns']

    # Only modify and save the data when a non-empty target name was
    # given; keeping the save inside the guard also avoids a NameError
    # on the dataset name when no new name is supplied.
    new_dataset_name = api.payload.get('newDatasetName')
    if new_dataset_name:
        modify(df, columns)
        save_dataset(df, new_dataset_name)

    # Independently, the column configuration can be stored for reuse.
    configuration_name = api.payload.get('configurationName')
    if configuration_name:
        save_configuration(dataset, configuration_name, columns)

    return None, 200