# Example 1
def main(artifact_location: str, production_ready: bool = False) -> None:
    """Train the model and persist its artifacts to the chosen backend.

    Args:
        artifact_location: String accepted by the ArtifactLocation enum
            (LOCAL, S3, or S3_MLFLOW variants — exact values defined elsewhere).
        production_ready: When True the MLflow run is tagged as live;
            otherwise it is tagged as a non-live candidate.

    Returns:
        None. Side effects: trains the model, writes local artifact files,
        and uploads them to S3 and/or logs them to MLflow.
    """
    art_loc = ArtifactLocation(artifact_location)
    data_dict = load_and_preprocess_data(art_loc)

    if art_loc == ArtifactLocation.LOCAL:
        # Artifacts are persisted locally by train_and_persist itself.
        _ = train_and_persist(data_dict)

    elif art_loc == ArtifactLocation.S3:
        _ = train_and_persist(data_dict)
        # Mirror both locally-written artifacts to the configured S3 bucket.
        s3 = boto3.client("s3")
        s3.upload_file(
            f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.FEATURE_ENGINEERING_ARTIFACT}",
            Bucket=Config.BUCKET_NAME,
            Key=
            f"{Config.S3_ARTIFACTS_DIR}/{Config.FEATURE_ENGINEERING_ARTIFACT}",
        )
        s3.upload_file(
            f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.CLASSIFIER_ARTIFACT}",
            Bucket=Config.BUCKET_NAME,
            Key=f"{Config.S3_ARTIFACTS_DIR}/{Config.CLASSIFIER_ARTIFACT}",
        )

    # BUG FIX: the original wrote `elif ArtifactLocation.S3_MLFLOW:`, which
    # tests the truthiness of the enum member (always True) instead of
    # comparing it to art_loc — any unrecognized location silently fell
    # into this branch.
    elif art_loc == ArtifactLocation.S3_MLFLOW:
        mlflow.set_tracking_uri(Config.TRACKING_URI)

        # MLflow experiment tracking: log hyper-parameters, metrics and
        # artifact files under a single run.
        with mlflow.start_run(experiment_id=Config.EXPERIMENT_ID):
            training_metadata = train_and_persist(data_dict)
            logging.info(mlflow.get_artifact_uri())
            for k, v in training_metadata["params"][
                    "feature_engineering"].items():
                mlflow.log_param(str(k), str(v))
            for k, v in training_metadata["params"]["classifier"].items():
                mlflow.log_param(str(k), str(v))

            mlflow.log_metric("training accuracy",
                              training_metadata["accuracy"]["train"])
            mlflow.log_metric("test accuracy",
                              training_metadata["accuracy"]["test"])
            mlflow.log_artifact(
                f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.FEATURE_ENGINEERING_ARTIFACT}"
            )
            mlflow.log_artifact(
                f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.CLASSIFIER_ARTIFACT}"
            )

            # A run is either live (production) or a candidate — never both.
            if production_ready:
                mlflow.set_tag(Config.LIVE_TAG, 1)
            else:
                mlflow.set_tag(Config.LIVE_TAG, 0)
                mlflow.set_tag(Config.CANDIDATE_TAG, 1)

            # When running in Github actions set EXPERIMENT_ID as env
            # for consumption by the subsequent step.
            # NOTE(review): `::set-output` is deprecated by GitHub Actions in
            # favor of writing to $GITHUB_OUTPUT — confirm the workflow still
            # supports this before changing it.
            print(f"::set-output name=EXPERIMENT_ID::{Config.EXPERIMENT_ID}")
from sklearn.tree import export_graphviz
import pydot
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFE

# Configure seaborn plot styling (second call overrides the first; kept
# as in the original for identical final state).
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

features, features_list, labels = utils.load_and_preprocess_data()

# Hold out 20% of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=0)

# Optionally rebalance the training classes by oversampling the minority
# class with SMOTE. Only the training split is resampled — the test split
# must keep the original class distribution.
isSMOTE = True
if isSMOTE:
    # BUG FIX: the original bound the sampler to the name `os`, shadowing
    # the `os` module used elsewhere in this file.
    smote = SMOTE(random_state=0)
    # BUG FIX: `fit_sample` was removed from imbalanced-learn (>= 0.8);
    # the supported API is `fit_resample`.
    resampled_X, resampled_y = smote.fit_resample(X_train, y_train)
    X_train = pd.DataFrame(data=resampled_X)
    y_train = pd.DataFrame(data=resampled_y)
    print("length of oversampled data is ", len(resampled_X))
    print(y_train[0].value_counts())