Example #1
def main():
    from pprint import pprint
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = list()

    os.chdir(checkpoint_path)
    for user in users:
        # All the checkpoints to be stored in checkpoints path

        # Binarize the labels for the current user without overwriting the
        # shared dataframe, so later iterations still see the original data
        user_df = user_to_binary(df, user)

        # Only train when the user has at least 10 positive samples
        if (user_df.user == 1).sum() >= 10:

            X_train, X_test, Y_train, Y_test = obtain_features(dataframe=user_df,
                                                               random_state=42)

            model = RandomForestClassifier()

            pprint(model.get_params())

            hyperparameters = {
                'max_features': [None, 'sqrt', 'log2'],
                'max_depth': [None, 1, 5, 10, 15, 20],
                'min_samples_leaf': [1, 2, 4, 6],
                'min_samples_split': [2, 4, 6, 8, 10],
                'n_estimators':
                [int(x) for x in np.linspace(start=10, stop=100, num=10)],
                'criterion': ['gini', 'entropy']
            }

            rf_random = RandomizedSearchCV(model,
                                           hyperparameters,
                                           n_iter=100,
                                           cv=5,
                                           verbose=2,
                                           random_state=42,
                                           n_jobs=-1)

            # Train the classifier on X_train and Y_train
            rf_random.fit(X_train, Y_train)
            # Verbose output is printed during the search; the best parameters are:
            pprint(rf_random.best_params_)

            # Store this user's results
            info = {'user': user, 'hyperparameters': rf_random.best_params_}
            data.append(info)

            print('Best score for training_data:', rf_random.best_score_)
    os.chdir(basedir)

    os.chdir(logs_path)
    with open("randomForest_GridSearch.txt", "w") as myfile:
        json.dump(data, myfile)
    os.chdir(basedir)
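
A detail worth noting about the search above: with the default refit=True, RandomizedSearchCV retrains the best configuration on the whole training set, so the tuned model can be used directly. A minimal follow-up sketch, reusing rf_random, X_test and Y_test from the example:

# With refit=True (the default), best_estimator_ is the winning
# configuration already refit on all of X_train / Y_train
best_model = rf_random.best_estimator_

# Evaluate the tuned model on the held-out split
print('Held-out accuracy:', best_model.score(X_test, Y_test))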
Example #2
def main():
    from pprint import pprint
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = list()

    os.chdir(checkpoint_path)
    for user in users:
        # All the checkpoints to be stored in checkpoints path
        # Binarize the labels for the current user without overwriting the
        # shared dataframe
        user_df = user_to_binary(df, user)

        # Only train when the user has at least 10 positive samples
        if (user_df.user == 1).sum() >= 10:

            X_train, X_test, Y_train, Y_test = obtain_features(dataframe=user_df,
                                                               random_state=42)

            # Apply standardization; the fitted scaler is saved to the
            # checkpoints folder
            X_train = save_scaling(X_train)
            # Scale the test set with the scaler fitted on the (oversampled)
            # training set
            X_test = load_scaling(X_test)

            model = SVC()

            pprint(model.get_params())

            parameter_candidates = [{
                'C': [1, 2, 5, 10, 20, 30, 100, 1000],
                'kernel': ['linear']
            }, {
                'C': [1, 2, 5, 10, 20, 30, 100, 1000],
                'gamma': [20., 10., 5., 1., 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf']
            }]

            # Run a grid search to find the parameters producing the highest score
            search = GridSearchCV(estimator=SVC(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            # Train the classifier on X_train and Y_train
            search.fit(X_train, Y_train)
            # Verbose output is printed during the search; the best parameters are:
            pprint(search.best_params_)

            # Store this user's results
            info = {'user': user, 'hyperparameters': search.best_params_}
            data.append(info)

            print('Best score for training_data:', search.best_score_)
    os.chdir(basedir)

    os.chdir(logs_path)
    with open("svc_GridSearch.txt", "w") as myfile:
        json.dump(data, myfile)
    os.chdir(basedir)
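
save_scaling and load_scaling are project helpers that these listings call but never show. Judging from the comments, a plausible reconstruction might look like the sketch below; the StandardScaler/joblib choice and the scaler.pkl filename are assumptions for illustration, not the project's confirmed implementation.

import joblib
from sklearn.preprocessing import StandardScaler

# Hypothetical sketch: fit a scaler on the training features and persist it
# in the current directory (the examples chdir into checkpoint_path first)
def save_scaling(x_train, filename='scaler.pkl'):
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x_train)
    joblib.dump(scaler, filename)  # checkpoint the fitted scaler
    return x_scaled

# Hypothetical sketch: reapply the persisted training-set scaler to new data
def load_scaling(x, filename='scaler.pkl'):
    scaler = joblib.load(filename)
    return scaler.transform(x)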
Example #3
def main():
    from pprint import pprint
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = list()
    os.chdir(checkpoint_path)
    for user in users:
        # All the checkpoints to be stored in checkpoints path
        # Binarize the labels for the current user without overwriting the
        # shared dataframe
        user_df = user_to_binary(df, user)

        # Only train when the user has at least 10 positive samples
        if (user_df.user == 1).sum() >= 10:

            X_train, X_test, Y_train, Y_test = obtain_features(dataframe=user_df,
                                                               random_state=42)

            # Apply standardization; the fitted scaler is saved to the
            # checkpoints folder
            X_train = save_scaling(X_train)
            # Scale the test set with the scaler fitted on the (oversampled)
            # training set
            X_test = load_scaling(X_test)

            model = LogisticRegression()

            pprint(model.get_params())

            parameter_candidates = [
                {
                    'C': [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                    'solver': ['newton-cg', 'lbfgs', 'sag'],
                    'penalty': ['l2'],
                    'class_weight': [None, 'balanced']
                },
                {
                    'C': [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                    'solver': ['liblinear', 'saga'],
                    'penalty': ['l2'],
                    'class_weight': [None, 'balanced']
                }
            ]

            # Run a grid search to find the parameters producing the highest score
            search = GridSearchCV(estimator=LogisticRegression(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            # Train the classifier on X_train and Y_train
            search.fit(X_train, Y_train)
            # Verbose output is printed during the search; the best parameters are:
            pprint(search.best_params_)

            # Store this user's results
            info = {'user': user, 'hyperparameters': search.best_params_}
            data.append(info)

            print('Best score for training_data:', search.best_score_)
    os.chdir(basedir)

    # Save everything to a .txt file we can read back later
    os.chdir(logs_path)
    with open("logRegr_GridSearch.txt", "w") as myfile:
        json.dump(data, myfile)
    os.chdir(basedir)
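
obtain_features is another helper the listings depend on but do not show. A minimal sketch of what such a split could look like, assuming 'user' is the binary label column produced upstream; the column handling is an illustration, and the real helper apparently also oversamples the training split, which is omitted here:

from sklearn.model_selection import train_test_split

# Hypothetical sketch: split a binarized dataframe into stratified
# train/test features and labels
def obtain_features(dataframe, random_state=42, test_size=0.2):
    y = dataframe['user']
    x = dataframe.drop(columns=['user', 'user_email'], errors='ignore')
    return train_test_split(x, y,
                            test_size=test_size,
                            random_state=random_state,
                            stratify=y)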
Example #4
MONGO_URI = os.environ.get(
    'MONGODB_URI',
    'mongodb://*****:*****@ds143070.mlab.com:43070/cubeauth')
basedir = os.path.abspath(os.path.dirname(__file__))
checkpoint_path = os.path.join(basedir, 'checkpoints')
logistic_regression_path = os.path.join(checkpoint_path, 'logistic_regression')
support_vector_classifier_path = os.path.join(checkpoint_path,
                                              'support_vector_classifier')
random_forest_path = os.path.join(checkpoint_path, 'random_forest')
logs_path = os.path.join(basedir, 'logs')

# Supported models: 'logRegr', 'svc', 'RandomForest'
model = 'svc'

# Loading dataframe from database
df = training_dataframe(mongodb_uri=MONGO_URI)
# Users involved in the experiment so far
users = df['user_email'].unique()

n_loops = 5

os.chdir(checkpoint_path)

for user in users:

    # Metric accumulators for the n_loops runs (presumably precision,
    # recall and F1)
    p = list()
    r = list()
    f1 = list()

    data = user_to_binary(df, user)
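
user_to_binary appears in every example: it recasts the multi-user dataframe as a one-vs-rest problem for a single user, which is why the listings can then count rows with df.user == 1. A plausible sketch; the column names and the returned copy are assumptions based on how the helper is used above:

# Hypothetical sketch: label rows belonging to `user` as 1 and all other
# rows as 0, returning a copy so the caller's dataframe stays untouched
def user_to_binary(df, user):
    out = df.copy()
    out['user'] = (out['user_email'] == user).astype(int)
    return out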
Example #5
def callback(ch, method, properties, body):
    # Queue-consumer callback: an incoming message triggers model training
    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    for model in models:
        print('Launching hyperparameter GridSearch for model ', model)
        if model == 'logRegr':
            os.system("python gridsearch_logRegr.py")
            # Load the tuned per-user parameters from the JSON file
            os.chdir(logs_path)
            with open('logRegr_GridSearch.txt', mode='r',
                      encoding='utf-8') as f:
                grid_search = json.load(f)
            os.chdir(basedir)
        elif model == 'svc':
            os.system("python gridsearch_svc.py")
            # Load the tuned per-user parameters from the JSON file
            os.chdir(logs_path)
            with open('svc_GridSearch.txt', mode='r', encoding='utf-8') as f:
                grid_search = json.load(f)
            os.chdir(basedir)
        elif model == 'RandomForest':
            os.system("python gridsearch_randomForest.py")
            os.chdir(logs_path)
            with open('randomForest_GridSearch.txt',
                      mode='r',
                      encoding='utf-8') as f:
                grid_search = json.load(f)
            os.chdir(basedir)
        # All the checkpoints to be stored in checkpoints path
        os.chdir(checkpoint_path)
        for user in users:
            print('Starting training of algorithm ', model,
                  ' for user ', user)
            # Binary classification per user
            data = user_to_binary(df, user)
            X_train, X_test, Y_train, Y_test = obtain_features(dataframe=data,
                                                               random_state=42)
            if model != 'RandomForest':
                # Apply standardization; the fitted scaler is saved to the
                # checkpoints folder
                X_train = save_scaling(X_train)
                # Scale the test set with the training-set scaler
                X_test = load_scaling(X_test)
            # Keep only this user's tuned hyperparameters; skip the user if
            # the grid search stored none (e.g. too few samples)
            info = None
            for item in grid_search:
                if item['user'] == user:
                    info = item['hyperparameters']
                    break
            if info is None:
                continue

            # The training is launched for user
            model_training(x_train=X_train,
                           x_test=X_test,
                           y_train=Y_train,
                           y_test=Y_test,
                           user=user,
                           model=model,
                           info=info)

        os.chdir(basedir)

    # Establish the connection to MongoDB
    client = MongoClient(MONGO_URI)
    # Parse the database name out of the MONGO_URI path
    db_name = urlparse(MONGO_URI).path[1:]
    db = client[db_name]
    # Once training is finished, the user status that triggered training
    # is set to authenticable
    db.users.update_one(json.loads(body), {'$set': {'authenticable': True}})
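
The database name is recovered from the connection string with urlparse, which exposes the path component of the URI. A quick standalone illustration (the URI here is made up):

from urllib.parse import urlparse

# For 'mongodb://localhost:27017/cubeauth', .path is '/cubeauth';
# stripping the leading slash leaves the database name
print(urlparse('mongodb://localhost:27017/cubeauth').path[1:])  # -> cubeauth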