import pickle

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# dl (the Mongo data layer), get_lc_train_data and
# _get_custom_clf_model_file_path are project-local helpers assumed to be
# importable from elsewhere in this package.


def get_standard_scaler(version=1):
    """Retrieves a StandardScaler from the scikit-learn package.

    The scaler is trained on the first call and then preserved in MongoDB.
    On all subsequent calls the preserved scaler is loaded from MongoDB.

    Returns:
        scaler: sklearn.preprocessing.StandardScaler
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    if "scaler_preserved" in system and system["scaler_preserved"]:
        print("--StandardScaler already preserved.")
        return pickle.loads(system["scaler_model_bin"])
    else:
        print("--StandardScaler trained and preserved for the first time.")
        # load train data
        train_data, _ = get_lc_train_data()
        # train the scaler and preserve it as a pickled binary in the system document
        scaler = StandardScaler().fit(train_data)
        system["scaler_model_bin"] = pickle.dumps(scaler)
        system["scaler_preserved"] = True
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return scaler
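

# Illustrative sketch (not part of the original module): the typical call
# pattern at scoring time. `new_samples` is a hypothetical 2-D array-like of
# raw feature rows; the call assumes the "LC" MongoDB database is reachable
# and that get_lc_train_data() can supply training data on the first call.
def _example_standardize_samples(new_samples, version=1):
    scaler = get_standard_scaler(version=version)  # trains on the first call only
    return scaler.transform(new_samples)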


def persisted_models_not_valid(system_query=None):
    """Checks whether the state of the persisted models is still valid.

    If the train data set has changed since the models were trained, their
    state is no longer valid and they need to be trained again to reflect
    the latest data set.

    Returns:
        bool: True if the models are not valid, False if they are valid.
    """
    train_collection_count = dl.get_collection_size(db="LC", collection="LC_TRAIN_DATA")
    system = dl.read_dict_from_mongo(db="LC", collection="LC_SYSTEM", query=system_query or {})
    if "train_collection_count" in system:
        return int(system["train_collection_count"]) != int(train_collection_count)
    return False
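

# Illustrative sketch (hypothetical helper, not in the original module): the
# guard pattern persisted_models_not_valid() is meant for. When the training
# collection has grown, clear every "*_preserved" flag so each getter retrains
# its model on the next call. Assumes the dl API behaves as it is used above.
def _example_invalidate_stale_models(version=1):
    if persisted_models_not_valid({"version": version}):
        system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
        for key in [k for k in system if k.endswith("_preserved")]:
            system[key] = False
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})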


def get_randomforest_classifier(n_trees=500, version=1):
    """Retrieves a Random Forest classifier from the scikit-learn package.

    The classifier is trained on the first call and then preserved on the
    file system. On all subsequent calls the preserved classifier is loaded
    from the file system. The classifier is trained on training data that is
    first standardized with StandardScaler and then transformed with
    PCA(n_components=50).

    Args:
        n_trees: Number of trees for the Random Forest classifier.

    Returns:
        rf_clf: sklearn.ensemble.RandomForestClassifier
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    preserved_key = "rf{0}_preserved".format(n_trees)
    path_key = "rf{0}_model_disk_path".format(n_trees)
    if preserved_key in system and system[preserved_key]:
        print("--Random Forest with {0} trees already preserved.".format(n_trees))
        with open(system[path_key], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--Random Forest with {0} trees trained and preserved for the first time.".format(n_trees))
        # load train data
        train_data, target_classes = get_lc_train_data()
        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)
        # load PCA and transform data
        pca = get_pca_scaler(n_components=50, version=version)
        transformed_train_data = pca.transform(standardized_train_data)
        # train and preserve the Random Forest
        rf_clf = RandomForestClassifier(n_estimators=n_trees)
        rf_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("rf{0}_model_v{1}".format(n_trees, version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(rf_clf, model_file)
        system[preserved_key] = True
        system[path_key] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return rf_clf
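

# Illustrative sketch (hypothetical, not in the original module): predicting
# with the cached Random Forest. The same standardize-then-PCA(50) pipeline
# used at training time must be applied at prediction time; the same pattern
# works for the KNeighbors, Decision Tree and Naive Bayes getters below, with
# their respective PCA sizes. `samples` is a hypothetical 2-D array-like.
def _example_predict_with_rf(samples, n_trees=500, version=1):
    scaler = get_standard_scaler(version=version)
    pca = get_pca_scaler(n_components=50, version=version)
    rf_clf = get_randomforest_classifier(n_trees=n_trees, version=version)
    return rf_clf.predict(pca.transform(scaler.transform(samples)))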


def get_kneighbors_classifier(version=1):
    """Retrieves a KNeighbors classifier from the scikit-learn package.

    The classifier is trained on the first call and then preserved on the
    file system. On all subsequent calls the preserved classifier is loaded
    from the file system. The classifier is trained on training data that is
    first standardized with StandardScaler and then transformed with
    PCA(n_components=50).

    Returns:
        kn_clf: sklearn.neighbors.KNeighborsClassifier
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    if "kn_preserved" in system and system["kn_preserved"]:
        print("--KNeighbors already preserved.")
        with open(system["kn_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--KNeighbors trained and preserved for the first time.")
        # load train data
        train_data, target_classes = get_lc_train_data()
        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)
        # load PCA and transform data
        pca = get_pca_scaler(n_components=50, version=version)
        transformed_train_data = pca.transform(standardized_train_data)
        # train and preserve KNeighbors
        kn_clf = KNeighborsClassifier()
        kn_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("kn_model_v{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(kn_clf, model_file)
        system["kn_preserved"] = True
        system["kn_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return kn_clf


def get_decisiontree_classifier(version=1):
    """Retrieves a Decision Tree classifier from the scikit-learn package.

    The classifier is trained on the first call and then preserved on the
    file system. On all subsequent calls the preserved classifier is loaded
    from the file system. The classifier is trained on training data that is
    first standardized with StandardScaler and then transformed with
    PCA(n_components=10).

    Returns:
        dt_clf: sklearn.tree.DecisionTreeClassifier
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    if "dt_preserved" in system and system["dt_preserved"]:
        print("--Decision Tree already preserved.")
        with open(system["dt_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--Decision Tree trained and preserved for the first time.")
        # load train data
        train_data, target_classes = get_lc_train_data()
        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)
        # load PCA and transform data
        pca = get_pca_scaler(n_components=10, version=version)
        transformed_train_data = pca.transform(standardized_train_data)
        # train and preserve the Decision Tree
        dt_clf = DecisionTreeClassifier()
        dt_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("dt_model_v{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(dt_clf, model_file)
        system["dt_preserved"] = True
        system["dt_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return dt_clf


def get_pca_scaler(n_components, svd_solver="auto", version=1):
    """Retrieves a PCA transformer from the scikit-learn package.

    The PCA is trained on the first call and then preserved on the file
    system. On all subsequent calls the preserved PCA is loaded from the
    file system. The PCA is trained on training data first standardized
    with StandardScaler.

    Args:
        n_components: Number of components to retain with PCA.
        svd_solver: PCA svd_solver.

    Returns:
        pca: sklearn.decomposition.PCA
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    preserved_key = "pca{0}_preserved".format(n_components)
    path_key = "pca{0}_model_disk_path".format(n_components)
    if preserved_key in system and system[preserved_key]:
        print("--PCA (n_components={0}) already preserved.".format(n_components))
        with open(system[path_key], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--PCA (n_components={0}) trained and preserved for the first time.".format(n_components))
        # load train data
        train_data, _ = get_lc_train_data()
        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)
        # train and preserve PCA
        pca = PCA(n_components=n_components, svd_solver=svd_solver)
        pca.fit(standardized_train_data)
        model_file_path = _get_custom_clf_model_file_path("pca{0}_model_v{1}".format(n_components, version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(pca, model_file)
        system[preserved_key] = True
        system[path_key] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return pca
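

# Illustrative sketch (hypothetical, not in the original module): checking how
# much variance a cached PCA retains, which helps when choosing n_components
# (this module uses 10, 20 and 50). explained_variance_ratio_ is a standard
# attribute of a fitted sklearn.decomposition.PCA.
def _example_pca_retained_variance(n_components=50, version=1):
    pca = get_pca_scaler(n_components=n_components, version=version)
    return float(pca.explained_variance_ratio_.sum())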


def get_naivebayes_classifier(version=1):
    """Retrieves a Naive Bayes classifier from the scikit-learn package.

    The classifier is trained on the first call and then preserved on the
    file system. On all subsequent calls the preserved classifier is loaded
    from the file system. The classifier is trained on training data that is
    first standardized with StandardScaler and then transformed with
    PCA(n_components=20).

    Returns:
        nb_clf: sklearn.naive_bayes.GaussianNB
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    if "nb_preserved" in system and system["nb_preserved"]:
        print("--Naive Bayes already preserved.")
        with open(system["nb_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--Naive Bayes trained and preserved for the first time.")
        # load train data
        train_data, target_classes = get_lc_train_data()
        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)
        # load PCA and transform data
        pca = get_pca_scaler(n_components=20, version=version)
        transformed_train_data = pca.transform(standardized_train_data)
        # train and preserve Naive Bayes
        nb_clf = GaussianNB()
        nb_clf.fit(transformed_train_data, target_classes)
        # file name follows the "<model>_v<version>" convention of the other getters
        model_file_path = _get_custom_clf_model_file_path("nb_model_v{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(nb_clf, model_file)
        system["nb_preserved"] = True
        system["nb_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
        return nb_clf
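

# Illustrative end-to-end sketch (hypothetical, not in the original module):
# warm every cached model once, e.g. after _example_invalidate_stale_models()
# has cleared the preserved flags. Each getter trains and preserves its model
# on the first call and is a cheap load afterwards.
if __name__ == "__main__":
    get_standard_scaler()
    get_pca_scaler(n_components=10)
    get_pca_scaler(n_components=20)
    get_pca_scaler(n_components=50)
    get_randomforest_classifier(n_trees=500)
    get_kneighbors_classifier()
    get_decisiontree_classifier()
    get_naivebayes_classifier()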