Example #1
def get_standard_scaler(version=1):
    """Retrieves StandardScaler from scikit package. Scaler is first trained and then preserved in MongoDb.
        Once preserved Scaler is retrieved from MongoDb on all following function calls.

        Returns:
            scaler: sklearn.preprocessing StandardScaler
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if "scaler_preserved" in system and system["scaler_preserved"]:
        print("--StandardScaller already preserved.")

        return pickle.loads(system["scaler_model_bin"])
    else:
        print("--StandardScaller trained and preserved for the first time.")

        # load train data
        train_data, _ = get_lc_train_data()

        # train and preserve scaler
        scaler = StandardScaler().fit(train_data)

        system["scaler_model_bin"] = pickle.dumps(scaler)
        system["scaler_preserved"] = True
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return scaler
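
The snippets in this listing all assume the same module-level setup, which is not shown. Below is a minimal sketch of the imports and helpers they rely on; dl, get_lc_train_data, and _get_custom_clf_model_file_path are project-specific helpers, so the stub shown for the path helper is an assumption inferred from the calls above, not the original code.

# Sketch of the module setup the examples assume (reconstructed, not original code).
import os
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# dl wraps the MongoDB access used above, e.g.:
# import data_layer as dl  # assumed module name

def _get_custom_clf_model_file_path(model_name):
    # Assumed helper: map a model name to a pickle file path on disk.
    return os.path.join("models", model_name + ".pkl")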
Example #2
def get_randomforest_classifier(n_trees=500, version=1):
    """Retrieves Random Forest classifier from scikit package. Classifier is first trained and 
        then preserved on file system. Once preserved classifier is retrieved from file system on all following function calls.
        Classifier is trained on training data first standardized with StandardScaler and then transformed with PCA(n_components=50)
        
        Args:
            n_trees: Number of trees for Random Forest classifier.
        
        Returns:
            rf_clf: sklearn.ensemble import RandomForestClassifier
    """

    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if "rf{0}_preserved".format(n_trees) in system and system["rf{0}_preserved".format(n_trees)]:
        print("--RandomForest with {0} trees already preserved.".format(n_trees))

        with open(system["rf{0}_model_disk_path".format(n_trees)], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--Random Forest with {0} trees trained and preserved for the first time.".format(n_trees))

        # load train data
        train_data, target_classes = get_lc_train_data()

        # load scaler and standardize data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)

        # load PCA and transform data
        pca = get_pca_scaler(n_components=50, version=version)
        transformed_train_data = pca.transform(standardized_train_data)

        # train and preserve RandomForest
        rf_clf = RandomForestClassifier(n_estimators=n_trees)
        rf_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("rf{0}_model_v{1}".format(n_trees, version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(rf_clf, model_file)

        system["rf{0}_preserved".format(n_trees)] = True
        system["rf{0}_model_disk_path".format(n_trees)] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return rf_clf
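
Because the forest is fitted on standardized, PCA-reduced features, the same preprocessing has to be applied at prediction time. A minimal usage sketch, assuming new_data is a feature matrix with the same columns as the training data:

# Hypothetical usage: reuse the cached preprocessors so inference matches training.
rf_clf = get_randomforest_classifier(n_trees=500, version=1)
scaler = get_standard_scaler(version=1)
pca = get_pca_scaler(n_components=50, version=1)

# new_data is assumed here; it must have the same feature layout as the train data.
predictions = rf_clf.predict(pca.transform(scaler.transform(new_data)))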
Example #3
def get_kneighbors_classifier(version=1):
    """Retrieves KNeighbors classifier from scikit package. Classifier is first trained and 
        then preserved on file system. Once preserved classifier is retrieved from file system on all following function calls.
        Classifier is trained on training data first standardized with StandardScaler and then transformed with PCA(n_components=50)

        Args:
            n_trees: Number of trees for Random Forest classifier.

        Returns:
            kn_clf: sklearn.neighbors KNeighborsClassifier
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if "kn_preserved" in system and system["kn_preserved"]:
        print("--KNeighbors already preserved.")

        with open(system["kn_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--KNeighbors trained and preserved for the first time.")

        # load train data
        train_data, target_classes = get_lc_train_data()

        # load scaler and transform data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)

        # load PCA and transform data
        pca = get_pca_scaler(n_components=50, version=version)
        transformed_train_data = pca.transform(standardized_train_data)

        # train and preserve KNeighbors
        kn_clf = KNeighborsClassifier()
        kn_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("kn_model_v{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(kn_clf, model_file)

        system["kn_preserved"] = True
        system["kn_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return kn_clf
Example #4
def get_decisiontree_classifier(version=1):
    """Retrieves Decision Tree classifier from scikit package. Classifier is first trained and 
        then preserved on file system. Once preserved classifier is retrieved from file system on all following function calls.
        Classifier is trained on training data first standardized with StandardScaler and then transformed with PCA(n_components=10)
        
        Returns:
            dt_clf: sklearn.tree DecisionTreeClassifier
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if "dt_preserved" in system and system["dt_preserved"]:
        print("--Decision Tree model already preserved.")

        with open(system["dt_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--DecisionTree trained and preserved for the first time.")

        # load train data
        train_data, target_classes = get_lc_train_data()

        # load scaler and transform data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)

        # load PCA and transform data
        pca = get_pca_scaler(n_components=10, version=version)
        transformed_train_data = pca.transform(standardized_train_data)

        # train and preserve Decision Tree
        dt_clf = DecisionTreeClassifier()
        dt_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("dt_model_v{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(dt_clf, model_file)

        system["dt_preserved"] = True
        system["dt_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return dt_clf
Example #5
def get_pca_scaler(n_components, svd_solver="auto", version=1):
    """Retrieves PCA from scikit package. PCA is first trained and 
        then preserved on file system. Once preserved PCA is retrieved from file system on all following function calls.
        PCA is trained on training data first standardized with StandardScaler.
        
        Args:
            n_components: Number of components to retrieve with PCA.
            svd_solver: PCA svd_solver

        Returns:
            pca: sklearn.decomposition PCA
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if str("pca{0}_preserved".format(n_components)) in system and system["pca{0}_preserved".format(n_components)]:
        print("--PCA (n_components={0}) already preserved.".format(n_components))

        with open(system["pca" + str(n_components) + "_model_disk_path"], "rb") as model_file:
            return pickle.load(model_file)
    else:
        print("--PCA (n_components={0}) trained and preserved for the first time.".format(n_components))

        # load train data
        train_data, _ = get_lc_train_data()

        # load scaler and transform data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)

        # train and preserve PCA
        pca = PCA(n_components=n_components, svd_solver=svd_solver)
        pca.fit(standardized_train_data)
        model_file_path = _get_custom_clf_model_file_path("pca{0}_model_v{1}".format(n_components, version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(pca, model_file)

        system["pca{0}_preserved".format(n_components)] = True
        system["pca{0}_model_disk_path".format(n_components)] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return pca
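
Note that the MongoDB flag and path keys embed n_components, so each distinct dimensionality gets its own cached PCA. A short sketch of the resulting behavior:

pca50 = get_pca_scaler(n_components=50, version=1)   # first call: fits and preserves
pca50 = get_pca_scaler(n_components=50, version=1)   # later calls: loads the pickle
pca10 = get_pca_scaler(n_components=10, version=1)   # separate cache entry: new fit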
Example #6
def get_naivebayes_classifier(version=1):
    """Retrieves Naive Bayes classifier from scikit package. Classifier is first trained and 
        then preserved on file system. Once preserved classifier is retrieved from file system on all following function calls.
        Classifier is trained on training data first standardized with StandardScaler and then transformed with PCA(n_components=20)

        Returns:
            nb_clf: sklearn.naive_bayes GaussianNB
    """
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})

    if "nb_preserved" in system and system['nb_preserved']:
        print("--Naive Bayes already preserved.")

        with open(system['nb_model_disk_path'], 'rb') as model_file:
            return pickle.load(model_file)
    else:
        print("--Naive Bayes trained and preserved for the first time.")

        # load train data
        train_data, target_classes = get_lc_train_data()

        # load scaler and transform data
        scaler = get_standard_scaler(version=version)
        standardized_train_data = scaler.transform(train_data)

        # load PCA and transform data
        pca = get_pca_scaler(n_components=20, version=version)
        transformed_train_data = pca.transform(standardized_train_data)

        # train and preserve Naive Bayes
        nb_clf = GaussianNB()
        nb_clf.fit(transformed_train_data, target_classes)
        model_file_path = _get_custom_clf_model_file_path("nb_model{0}".format(version))
        with open(model_file_path, "wb") as model_file:
            pickle.dump(nb_clf, model_file)

        system["nb_preserved"] = True
        system["nb_model_disk_path"] = model_file_path
        dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})

        return nb_clf
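
All six functions repeat the same fit-once, load-afterwards pattern: check a *_preserved flag in the LC_SYSTEM document, load the pickle if it is set, otherwise fit, pickle, and record the flag and path. If more models are added, the pattern could be factored into one helper. A rough sketch, under the same assumptions about dl and _get_custom_clf_model_file_path as above:

def get_or_fit_model(key, fit_fn, version=1):
    """Generic fit-once cache: key names the model in LC_SYSTEM, fit_fn returns a fitted estimator."""
    system = dl.read_dict_from_mongo("LC", "LC_SYSTEM", {"version": version})
    preserved_key = key + "_preserved"
    path_key = key + "_model_disk_path"

    if system.get(preserved_key):
        with open(system[path_key], "rb") as model_file:
            return pickle.load(model_file)

    model = fit_fn()
    model_file_path = _get_custom_clf_model_file_path("{0}_model_v{1}".format(key, version))
    with open(model_file_path, "wb") as model_file:
        pickle.dump(model, model_file)

    system[preserved_key] = True
    system[path_key] = model_file_path
    dl.update_mongo_collection(system, "LC", "LC_SYSTEM", {"version": version})
    return model

# Example: dt_clf = get_or_fit_model("dt", fit_fn=train_decision_tree, version=1),
# where train_decision_tree performs the scaling/PCA/fit steps shown above (hypothetical).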