def test_sklearn_deploy(self, mock_post, mock_put):
    """Deploying a small model must issue exactly one POST and one PUT.

    ``mock_post`` and ``mock_put`` are presumably injected by ``patch``
    decorators outside this view (TODO confirm); both are configured to
    answer with HTTP 200.
    """
    model_name = 'test-model'

    # Canned response for the POST request.
    post_response = Mock(status_code=200)
    post_response.json.return_value = {'url': 'http://test-url', 'tier': 0}
    mock_post.return_value = post_response

    # Canned response for the PUT request.
    put_response = Mock(status_code=200)
    put_response.json.return_value = {}
    mock_put.return_value = put_response

    # Reuse a previously serialized model when one is on disk; otherwise
    # train a tiny classifier and cache it for later runs.
    if not path.exists(model_name):
        X, y = make_classification(n_samples=50, n_features=3, n_classes=3,
                                   n_informative=3, n_redundant=0,
                                   random_state=0, shuffle=False)
        clf = RandomForestClassifier()
        clf.fit(X, y)
        skljson.to_json(clf, model_name)
    else:
        clf = skljson.from_json(model_name)

    client = SKLearn('test')
    client.deploy(clf, model_name)

    mock_post.assert_called_once()
    mock_put.assert_called_once()
def deserialize_model(path):
    """Load a model from the JSON file at ``path`` via ``skljson.from_json``.

    After loading, every entry of ``model.coefs_`` is replaced in place by
    ``np.array(entry)``.

    :param path: location of the serialized model, passed to ``skljson.from_json``
    :return: the deserialized model with array-valued ``coefs_`` entries
    """
    restored = skljson.from_json(path)

    # Convert coefficients to numpy arrays. Per the original author this is
    # a hack compensating for a bug in sklearn_json.
    for index in range(len(restored.coefs_)):
        restored.coefs_[index] = np.array(restored.coefs_[index])

    return restored
def __init__(self, modelFile, game, config="configs/config.ini"):
    """Store the game/config and load a model chosen by file extension.

    :param modelFile: path to the model file; the text after the last ``.``
        selects the loader (``json`` -> ``skljson.from_json``,
        ``pickle`` -> ``pickle.load``)
    :param game: stored unchanged on ``self.game``
    :param config: stored on ``self.config``; ``None`` falls back to
        ``"configs/config.ini"``

    For any other extension a message is printed and ``self.model`` stays
    ``None``.
    """
    ext = modelFile.split(sep='.')[-1]
    self.model = None
    self.game = game
    # Identity check: ``== None`` can be hijacked by a custom ``__eq__``.
    self.config = "configs/config.ini" if config is None else config

    if ext == "json":
        self.model = skljson.from_json(modelFile)
    elif ext == "pickle":
        # SECURITY NOTE: unpickling executes arbitrary code embedded in the
        # file; only load pickle files that come from a trusted source.
        with open(modelFile, 'rb') as f:
            self.model = pickle.load(f)
    else:
        print("Unknown extention")
def check_sparse_model_json(self, model, model_name, abs=False):
    """Fit ``model`` on the sparse fixture and verify a JSON round trip.

    :param model: estimator to fit and serialize
    :param model_name: file name used for ``skljson.to_json`` / ``from_json``
    :param abs: when true, fit on ``np.absolute(self.X_sparse)`` instead of
        ``self.X_sparse``

    The check passes when the original and the deserialized model produce
    equal predictions on ``self.X``.
    """
    # Given
    training_input = np.absolute(self.X_sparse) if abs else self.X_sparse
    model.fit(training_input, self.y_sparse)

    # When
    skljson.to_json(model, model_name)
    restored = skljson.from_json(model_name)

    # Then
    testing.assert_array_equal(model.predict(self.X),
                               restored.predict(self.X))
def predict(
    path="",
    model=None,
    prefix="",
    out_dir="",
):
    """Score proteins with a serialized classifier and write the result as CSV.

    Parameters
    ----------
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``.
    model: str or Path, optional
        Path to the JSON-serialized classifier. Defaults to the bundled
        ``data/model/UP000005640_9606_llps.json`` next to this module.
    prefix: str
        Forwarded to ``export_matrix`` and used in the output file name
        ``prediction_{prefix}.csv``.
    out_dir: str
        Output folder; created (with parents) when missing.

    Raises
    ------
    Exception
        Whatever ``skljson.from_json`` raises when the model cannot be
        loaded. The error is logged and re-raised.
    """
    if model is None:
        model = Path(__file__).parent / "data/model/UP000005640_9606_llps.json"
    try:
        logger.info("Loading model: {m}", m=model)
        clf = skljson.from_json(model)
    except Exception:
        logger.error("classifier {mod} not found. Does the file exist?", mod=model)
        # Without a classifier nothing below can work; re-raise instead of
        # continuing into a confusing NameError on ``clf`` after the
        # feature export has already been run.
        raise
    mat = export_matrix(prefix=prefix, fasta_path=path, out_path=out_dir)
    # Preprocessing
    data_ps = preprocess_and_scaledata(mat.df)
    X = data_ps.select_dtypes([np.number])
    logger.info("Predicting PSAP_score")
    psap_prediction = pd.DataFrame(index=data_ps["protein_name"])
    # Column 1 of predict_proba is used as the score.
    psap_prediction["PSAP_score"] = clf.predict_proba(X)[:, 1]
    # Rank 1 corresponds to the highest score.
    psap_prediction["rank"] = psap_prediction["PSAP_score"].rank(ascending=False)
    # Make directory for output
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"prediction_{prefix}.csv"
    logger.info("Writing results to: {csv}", csv=out_file)
    psap_prediction.to_csv(out_file)
def test_sklearn_model_exceeded(self, mock_post):
    """Deploying a large model must raise ``mlrequest.ModelSizeExceeded``.

    ``mock_post`` is presumably injected by a ``patch`` decorator outside
    this view (TODO confirm). It answers with HTTP 200 and is expected to
    be called exactly once.
    """
    model_name = 'test-model-1mb'

    # Canned response for the POST request.
    post_response = Mock(status_code=200)
    post_response.json.return_value = {'url': 'http://test-url', 'tier': 0}
    mock_post.return_value = post_response

    # Reuse a previously serialized model when one is on disk; otherwise
    # train on a larger dataset and cache the result.
    if not path.exists(model_name):
        X, y = make_classification(n_samples=15000, n_features=10,
                                   n_classes=3, n_informative=3,
                                   n_redundant=0, random_state=0,
                                   shuffle=False)
        clf = RandomForestClassifier()
        clf.fit(X, y)
        skljson.to_json(clf, model_name)
    else:
        clf = skljson.from_json(model_name)

    client = SKLearn('test')
    with self.assertRaises(mlrequest.ModelSizeExceeded):
        client.deploy(clf, model_name)

    mock_post.assert_called_once()
def load_serialized(filename_with_path):
    """Load a serialized regressor together with its optional feature selection.

    The feature-selection file is looked up next to the regressor by
    replacing ``_reg.json`` with ``_fs.json`` in the given path.

    :param filename_with_path: path to the JSON-serialized regressor
    :return: a ``Model`` wrapper exposing ``predict``, or ``None`` (after
        printing a message) when ``filename_with_path`` is not a file
    """
    if not os.path.isfile(filename_with_path):
        print(
            "{} is not a valid file, please check".format(filename_with_path))
        return None

    feature_selection_filename = filename_with_path.replace(
        "_reg.json", "_fs.json")

    feature_selection = None
    # If the name does not contain "_reg.json" the replace is a no-op and the
    # "feature selection" path would point at the regressor itself; skip it
    # rather than loading the model JSON as a column selection.
    if (feature_selection_filename != filename_with_path
            and os.path.isfile(feature_selection_filename)):
        feature_selection = json_load(feature_selection_filename)

    regressor = skljson.from_json(filename_with_path)
    # override n_jobs to prevent warning, model should be fast enough
    # n_jobs helps during training
    regressor.n_jobs = 1

    class Model:
        """
        wrapper to the serialized scikit learn model,
        that uses feature selection in the first step
        """

        def __init__(self, regressor, fs=None):
            self._regressor = regressor
            self._fs = fs

        def feature_select(self, X):
            """Return a list with each row of ``X`` indexed by the selection."""
            fs = np.array(self._fs)
            # perform selection for each input row
            return [row[fs] for row in np.array(X)]

        def predict(self, X):
            """Predict with the regressor, applying feature selection if set."""
            if self._fs:
                X = self.feature_select(X)
            return self._regressor.predict(X)

    return Model(regressor, feature_selection)
def conOutFinal(cfile, order):
    """Write per-rank 'Match' probabilities for every record of ``cfile``.

    The output path is ``cfile`` with ``.tocnf`` replaced by ``.possum``.
    Each input line is tab-separated: read length, raw score, taxid. For
    every line, one ``<lineage entry>:::<probability>`` field per model is
    written, tab-joined.

    :param cfile: path of the tab-separated input file
    :param order: value interpolated into the model file names
        (``models/<rank>_<order>.json`` inside the ``POSMM`` package)

    Relies on the module-level ``glin`` mapping (taxid -> lineage), which is
    defined outside this block. Exits the process via ``sys.exit()`` when
    any model fails to load.
    """
    # Output order of the models; lineage[0] pairs with the first rank here.
    ranks = ('phylum', 'class', 'order', 'family', 'genus', 'species')

    # The context manager guarantees the output file is closed on every
    # path, including the early exit and errors while scoring.
    with open(cfile.replace('.tocnf', '.possum'), 'w') as out:
        try:
            modlist = [
                skljson.from_json(
                    pkg_resources.resource_filename(
                        'POSMM',
                        'models/' + rank + '_' + str(order) + '.json'))
                for rank in ranks
            ]
        except Exception:
            # ``except Exception`` (not a bare except) so Ctrl-C and
            # SystemExit are not reported as missing models.
            print('ERROR: Models Missing. Aborting')
            # NOTE(review): exits with status 0, as before — consider a
            # non-zero status if callers check the exit code.
            sys.exit()

        # Column of the 'Match' class in each model's predict_proba output.
        matchpos = [list(m.classes_).index('Match') for m in modlist]

        with open(cfile) as infile:
            for line in infile:
                values = line.rstrip().split('\t')
                taxid = values[2]
                # The first lineage entry is skipped.
                lineage = glin[taxid][1:]
                rawscore = float(values[1])
                readlen = int(values[0])
                features = [[rawscore, readlen]]
                wrstr = []
                for x, (model, pos) in enumerate(zip(modlist, matchpos)):
                    cfs = model.predict_proba(features)[0][pos]
                    wrstr.append(lineage[x] + ':::' + str(cfs))
                out.write('\t'.join(wrstr) + '\n')
def open_model(model_path):
    """Open and return a model from a json file.

    Three layouts are handled:

    * paths ending in ``gz``: gzip-compressed JSON, decoded as UTF-8 and
      passed to ``from_dict``;
    * files whose first line starts with ``{"learner"``: loaded into an
      ``xgb.Booster``;
    * anything else: loaded with ``from_json``.

    :param model_path: path to the model
    :return: the loaded model object
    """
    if model_path.endswith('gz'):
        with gzip.open(model_path, 'r') as handle:
            raw = handle.read()
        return from_dict(json.loads(raw.decode('utf-8')))

    # Peek at the first line only to tell the two plain-JSON formats apart.
    with open(model_path, 'r') as handle:
        first_line = handle.readline()

    if not first_line.startswith('{"learner"'):
        return from_json(model_path)

    booster = xgb.Booster()
    booster.load_model(model_path)
    return booster
def predict(path, model, prefix="", out_dir=""):
    """Score proteins with a serialized classifier and write the result as CSV.

    Parameters
    ----------
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``.
    model: str
        Path to the JSON-serialized classifier, loaded with
        ``skljson.from_json``.
    prefix: str
        Forwarded to ``export_matrix`` as ``name`` and used in the output
        file name ``prediction_{prefix}.csv``.
    out_dir: str
        Output folder; created (with parents) when missing.

    Raises
    ------
    Exception
        Whatever ``skljson.from_json`` raises when the model cannot be
        loaded. A message is printed and the error is re-raised.
    """
    print("Loading model")
    print(model)
    try:
        clf = skljson.from_json(model)
    except Exception:
        print("An error occured while importing the model from json")
        # Without a classifier nothing below can work; re-raise instead of
        # continuing into a confusing NameError on ``clf`` after the
        # annotation step has already been run.
        raise
    print("annotating fasta")
    data = export_matrix(name=prefix, fasta_path=path, out_path=out_dir)
    # Preprocessing
    data_ps = preprocess_and_scaledata(data, "llps")
    data_numeric = data_ps.select_dtypes([np.number])
    X = data_numeric.drop("llps", axis=1)
    y = data_numeric["llps"]
    psap_prediction = pd.DataFrame(index=data["protein_name"])
    # Column 1 of predict_proba is used as the score.
    psap_prediction["PSAP_score"] = clf.predict_proba(X)[:, 1]
    psap_prediction["llps"] = y.values
    # Only rows with llps == 0 are ranked (rank 1 = highest score); the
    # remaining rows have no entry in the ranked series.
    psap_prediction["rank"] = psap_prediction.loc[
        psap_prediction["llps"] == 0, "PSAP_score"
    ].rank(ascending=False)
    # Make directory for output
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    psap_prediction.to_csv(out_dir / f"prediction_{prefix}.csv")
# Train a scikit-learn random forest on the wine dataset, serialize it to
# JSON with sklearn_json, load it back and print the test-set score.
import sklearn_json as skljson
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Load data
wine = load_wine()

# Split into train and test sets (30% held out for testing)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data, wine.target, test_size=0.3)

model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
model.fit(Xtrain, Ytrain)

# Serialize to JSON
skljson.to_json(model, "rf-model")

# Deserialize from JSON and evaluate the restored model
model2 = skljson.from_json("rf-model")
score = model2.score(Xtest, Ytest)
print(score)
# Load a JSON-serialized model and a pickled scaler, then predict for one
# hand-written sample.
import pickle

import sklearn_json as skljson
from sklearn.preprocessing import MinMaxScaler

deserialized_model = skljson.from_json('lr_model.json')

# NOTE: despite its ".json" name this file is read in binary mode and
# unpickled. Unpickling runs arbitrary code, so only use trusted files.
with open('scaler.json', 'rb') as f:
    scaler = pickle.load(f)

# Scale the raw sample with the stored scaler before predicting.
scaled_sample = scaler.transform([[152, 6.5, 8.5, 0.72]])
deserialized_model.predict(scaled_sample)
# Mean absolute SHAP value per feature, appended to the importance table.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_train)
# Only the first element of shap_values is used (presumably the values for
# the first class — TODO confirm against the shap version in use).
vals = np.abs(shap_values[0]).mean(0)
shap_importance = pd.Series(vals, index=X_train.columns).rename('shap')
shap_importance.sort_values(ascending=False, inplace=True)
imp = pd.concat([imp, shap_importance], axis=1)
imp.to_csv('feature_importnace.csv')

# Save model
import sklearn_json as skljson
file_name = os.path.join('models', 'random_forest.json')
clf_ser = serialize_random_forest(clf)
with open(file_name, 'w') as model_json:
    json.dump(clf_ser, model_json)
# Round-trip check: the file just written must load and predict.
deserialized_model = skljson.from_json(file_name)
deserialized_model.predict(X_test)

# add to dropbox
# NOTE(review): a literal Dropbox access token used to be embedded in the
# commented-out call below. It has been redacted here; the old token should
# be revoked, and any future upload should read it from the environment.
# import dropbox
# dbx = dropbox.Dropbox(os.environ['DROPBOX_ACCESS_TOKEN'])
# with open(file_name, 'rb') as f:
#     dbx.files_upload(f.read(), '/trend_labeling/' + file_name, mute = True)


### REFIT THE MODEL WITH MOST IMPORTANT FEATURES
# fi_cols = shap_values['col_name'].head(keep_important_features)
# X_train_important = X_train[fi_cols]
# X_test_important = X_test[fi_cols]
# clf = RandomForestClassifier(criterion='entropy',
#                              max_features=keep_important_features,
#                              min_weight_fraction_leaf=min_weight_fraction_leaf,
# Requires: pip install sklearn-json
import sklearn_json as skljson

# Serialize ``model`` (defined elsewhere) to JSON and load it back. The same
# ``file_name`` is used for both steps so they cannot drift apart.
file_name = "abc.json"
skljson.to_json(model, file_name)
deserialized_model = skljson.from_json(file_name)
# read and save model in file
import json
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine

# Load the raw JSON for inspection; the context manager closes the file
# handle instead of leaving it open.
with open("tree_model", mode='r', encoding='utf-8') as model_file:
    f = json.load(model_file)
# print(f, type(f))
# print(f['meta'])

# ======================================
# Read the model from the JSON file and use it
# ======================================
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
model = skljson.from_json("tree_model")
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))
print(model.predict(Xtest))
print(Ytest)