Beispiel #1
0
def main():
    """Grid-search SVC hyperparameters per user and log the best ones.

    Loads the training dataframe, and for every distinct ``user_email``
    builds a one-vs-rest frame with ``user_to_binary``. Users with fewer
    than 10 positive rows (``user == 1``) are skipped. For the remaining
    users a ``GridSearchCV`` over linear and RBF ``SVC`` candidates is
    fitted on the scaled training split.

    Side effects:
        * Runs the per-user work with ``checkpoint_path`` as the working
          directory (the scaling helpers are called from there).
        * Writes a JSON list of ``{'user', 'hyperparameters'}`` dicts to
          ``svc_GridSearch.txt`` inside ``logs_path``.
        * Leaves the working directory at ``basedir`` on exit, matching
          the sibling grid-search scripts.
    """
    from pprint import pprint

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    min_positive_samples = 10

    c_values = [1, 2, 5, 10, 20, 30, 100, 1000]
    parameter_candidates = [{
        'C': c_values,
        'kernel': ['linear']
    }, {
        'C': c_values,
        'gamma': [20., 10., 5., 1., 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf']
    }]

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = []

    # All the checkpoints are to be stored in the checkpoints path.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Derive the binary frame from the untouched dataframe on every
            # iteration. Rebinding `df` here (as the code used to) fed each
            # later user a frame already transformed for a previous user.
            user_df = user_to_binary(df, user)

            if np.count_nonzero(user_df.user == 1) < min_positive_samples:
                continue

            X_train, X_test, Y_train, Y_test = obtain_features(
                dataframe=user_df, random_state=42)

            # Standardise the training split; per the original note a scaling
            # file is saved in the checkpoints folder by this helper.
            X_train = save_scaling(X_train)
            # Scale the test split according to the training split.
            X_test = load_scaling(X_test)

            # Show the estimator defaults for reference in the run output.
            pprint(SVC().get_params())

            # Conduct grid search to find the parameters with highest score.
            search = GridSearchCV(estimator=SVC(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Keep the winning configuration for this user.
            data.append({
                'user': user,
                'hyperparameters': search.best_params_,
            })

            print('Best score for training_data:', search.best_score_)
    finally:
        # Restore the working directory even if a fit fails midway.
        os.chdir(basedir)

    os.chdir(logs_path)
    try:
        with open("svc_GridSearch.txt", "w") as myfile:
            json.dump(data, myfile)
    finally:
        # The original ended in checkpoint_path here, unlike its siblings.
        os.chdir(basedir)
def main():
    """Randomized-search RandomForest hyperparameters per user and log them.

    Loads the training dataframe, and for every distinct ``user_email``
    builds a one-vs-rest frame with ``user_to_binary``. Users with fewer
    than 10 positive rows (``user == 1``) are skipped. For the remaining
    users a 5-fold ``RandomizedSearchCV`` (100 iterations, seed 42) over a
    ``RandomForestClassifier`` is fitted on the training split. No feature
    scaling is applied in this script.

    Side effects:
        * Runs the per-user work with ``checkpoint_path`` as the working
          directory.
        * Writes a JSON list of ``{'user', 'hyperparameters'}`` dicts to
          ``randomForest_GridSearch.txt`` inside ``logs_path``.
        * Leaves the working directory at ``basedir`` on exit.
    """
    from pprint import pprint

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    min_positive_samples = 10

    # NOTE(review): 'auto' for max_features is not accepted by recent
    # scikit-learn releases - confirm against the pinned version.
    hyperparameters = {
        'max_features': [None, 'auto', 'sqrt', 'log2'],
        'max_depth': [None, 1, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 6],
        'min_samples_split': [2, 4, 6, 8, 10],
        'n_estimators':
        [int(x) for x in np.linspace(start=10, stop=100, num=10)],
        'criterion': ['gini', 'entropy']
    }

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = []

    # All the checkpoints are to be stored in the checkpoints path.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Derive the binary frame from the untouched dataframe on every
            # iteration. Rebinding `df` here (as the code used to) fed each
            # later user a frame already transformed for a previous user.
            user_df = user_to_binary(df, user)

            if np.count_nonzero(user_df.user == 1) < min_positive_samples:
                continue

            X_train, X_test, Y_train, Y_test = obtain_features(
                dataframe=user_df, random_state=42)

            model = RandomForestClassifier()
            # Show the estimator defaults for reference in the run output.
            pprint(model.get_params())

            search = RandomizedSearchCV(model,
                                        hyperparameters,
                                        n_iter=100,
                                        cv=5,
                                        verbose=2,
                                        random_state=42,
                                        n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Keep the winning configuration for this user.
            data.append({
                'user': user,
                'hyperparameters': search.best_params_,
            })

            print('Best score for training_data:', search.best_score_)
    finally:
        # Restore the working directory even if a fit fails midway.
        os.chdir(basedir)

    os.chdir(logs_path)
    try:
        with open("randomForest_GridSearch.txt", "w") as myfile:
            json.dump(data, myfile)
    finally:
        os.chdir(basedir)
Beispiel #3
0
def main():
    """Grid-search LogisticRegression hyperparameters per user and log them.

    Loads the training dataframe, and for every distinct ``user_email``
    builds a one-vs-rest frame with ``user_to_binary``. Users with fewer
    than 10 positive rows (``user == 1``) are skipped. For the remaining
    users a ``GridSearchCV`` over two solver families is fitted on the
    scaled training split.

    Side effects:
        * Runs the per-user work with ``checkpoint_path`` as the working
          directory (the scaling helpers are called from there).
        * Writes a JSON list of ``{'user', 'hyperparameters'}`` dicts to
          ``logRegr_GridSearch.txt`` inside ``logs_path`` so it can be
          read back later.
        * Leaves the working directory at ``basedir`` on exit.
    """
    from pprint import pprint

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    min_positive_samples = 10

    c_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # NOTE(review): 'warn' was a transitional solver value in older
    # scikit-learn releases - confirm it is valid for the pinned version.
    parameter_candidates = [
        {
            'C': c_values,
            'solver': ['newton-cg', 'lbfgs', 'sag'],
            'penalty': ['l2'],
            'class_weight': [None, 'balanced']
        },
        {
            'C': c_values,
            'solver': ['warn', 'liblinear', 'saga'],
            'penalty': ['l2'],
            'class_weight': [None, 'balanced']
        }
    ]

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    data = []

    # All the checkpoints are to be stored in the checkpoints path.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Derive the binary frame from the untouched dataframe on every
            # iteration. Rebinding `df` here (as the code used to) fed each
            # later user a frame already transformed for a previous user.
            user_df = user_to_binary(df, user)

            if np.count_nonzero(user_df.user == 1) < min_positive_samples:
                continue

            X_train, X_test, Y_train, Y_test = obtain_features(
                dataframe=user_df, random_state=42)

            # Standardise the training split; per the original note a scaling
            # file is saved in the checkpoints folder by this helper.
            X_train = save_scaling(X_train)
            # Scale the test split according to the training split.
            X_test = load_scaling(X_test)

            # Show the estimator defaults for reference in the run output.
            pprint(LogisticRegression().get_params())

            # Conduct grid search to find the parameters with highest score.
            search = GridSearchCV(estimator=LogisticRegression(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Keep the winning configuration for this user.
            data.append({
                'user': user,
                'hyperparameters': search.best_params_,
            })

            print('Best score for training_data:', search.best_score_)
    finally:
        # Restore the working directory even if a fit fails midway.
        os.chdir(basedir)

    # Save everything to a .txt file that can be accessed afterwards.
    os.chdir(logs_path)
    try:
        with open("logRegr_GridSearch.txt", "w") as myfile:
            json.dump(data, myfile)
    finally:
        os.chdir(basedir)
Beispiel #4
0
for user in users:

    p = list()
    r = list()
    f1 = list()

    data = user_to_binary(df, user)

    if np.where(data.user == 1)[0].__len__() >= 10:

        for _ in range(n_loops):

            try:
                # Realizamos la partición del dataset
                X_train, X_test, Y_train, Y_test = obtain_features(
                    dataframe=data, random_state=random.randint(1, 100))
            except ValueError:
                # Realizamos la partición del dataset
                X_train, X_test, Y_train, Y_test = obtain_features(
                    dataframe=data, random_state=random.randint(1, 100))

            if model != 'RandomForest':
                # Normalizamos el test dataset de acuerdo al training
                # # dataset sobre el que se ha hecho oversampling
                X_test = load_scaling(X_test)

            if model == 'logRegr':
                filename = 'logistic_regression/' + user + '.sav'
            elif model == 'svc':
                filename = 'support_vector_classifier/' + user + '.sav'
            elif model == 'RandomForest':
Beispiel #5
0
def callback(ch, method, properties, body):
    """Run hyperparameter search and training for every model, then flag the user.

    For each entry in ``models`` the matching grid-search script is run, its
    JSON result file is read from ``logs_path``, and ``model_training`` is
    called once per user with that user's logged hyperparameters. Finally
    the MongoDB ``users`` document matching the JSON-decoded ``body`` is
    updated with ``authenticable = True``.

    Args:
        ch: Unused here.
        method: Unused here.
        properties: Unused here.
        body: JSON text; decoded and used as the MongoDB filter for the
            user whose status is updated.

    Notes:
        * The signature looks like a message-queue consumer callback -
          TODO confirm against the code that registers it.
        * Users with no entry in the grid-search log are skipped. The old
          code reused the previous user's hyperparameters for them (or hit
          an unbound ``info`` for the first user).
        * Model names without a known grid-search script are skipped rather
          than silently reusing the previous model's results.
    """
    # model name -> (command that runs the search, result file in logs_path)
    grid_search_jobs = {
        'logRegr': ("python gridsearch_logRegr.py",
                    'logRegr_GridSearch.txt'),
        'svc': ("python gridsearch_svc.py",
                'svc_GridSearch.txt'),
        'RandomForest': ("python gridsearch_randomForest.py",
                         'randomForest_GridSearch.txt'),
    }

    # Training of the model is launched
    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    for model in models:
        print('Lanzando GridSearch de Hiperparámetros para modelo ', model)
        job = grid_search_jobs.get(model)
        if job is None:
            print('No grid search is defined for model ', model,
                  '; skipping it')
            continue
        command, results_file = job

        os.system(command)
        # Load the best parameters found for each user from the JSON log.
        os.chdir(logs_path)
        try:
            with open(results_file, mode='r', encoding='utf-8') as f:
                grid_search = json.load(f)
        finally:
            os.chdir(basedir)

        # Index by user; a later duplicate entry wins, as in the old scan.
        best_params = {
            item['user']: item['hyperparameters'] for item in grid_search
        }

        # All the checkpoints are to be stored in the checkpoints path.
        os.chdir(checkpoint_path)
        try:
            for user in users:
                if user not in best_params:
                    print('No hyperparameters logged for user ', user,
                          ' and model ', model, '; skipping training')
                    continue
                info = best_params[user]

                print('Comenzando entrenamiento del algortimo ', model,
                      ' para usuario ', user)
                # Binary classification for each user.
                data = user_to_binary(df, user)
                X_train, X_test, Y_train, Y_test = obtain_features(
                    dataframe=data, random_state=42)
                if model != 'RandomForest':
                    # Standardise the training split; per the original note
                    # a scaling file is saved in the checkpoints folder.
                    X_train = save_scaling(X_train)
                    # Scale the test split according to the training split.
                    X_test = load_scaling(X_test)

                # The training is launched for user
                model_training(x_train=X_train,
                               x_test=X_test,
                               y_train=Y_train,
                               y_test=Y_test,
                               user=user,
                               model=model,
                               info=info)
        finally:
            os.chdir(basedir)

    # Connection to MongoDB is established. A stray trailing comma used to
    # turn `client` into a 1-tuple, so `client[...]` raised TypeError.
    client = MongoClient(MONGO_URI)
    # The database name is the URI path without its leading slash.
    db_name = urlparse(MONGO_URI).path[1:]
    db = client[db_name]
    # Once training is finished, the user status that triggered training
    # is set to authenticable
    db.users.update_one(json.loads(body), {'$set': {'authenticable': True}})
                          ,
                          [
                           '*****@*****.**'
                          ], 
                          inplace=True)

# One-hot encode the user e-mail column, drop the raw column and export
# the widened frame.
email_dummies = pd.get_dummies(df['user_email'], prefix='user_email')
df_onehot = pd.concat([df, email_dummies], axis=1).drop(['user_email'], axis=1)
df_onehot.to_csv('dataframe_onehot.csv')

users = df['user_email'].unique()
print(users)

# Export one CSV per user: the train and test splits are stacked back
# together, with the labels appended as the last column(s).
for user in users:
    data = user_to_binary(df, user=user)
    X_train, X_test, Y_train, Y_test = obtain_features(
        dataframe=data,
        random_state=42,
    )

    feat = pd.concat([X_train, X_test], ignore_index=True)
    labels = pd.concat([Y_train, Y_test], ignore_index=True)
    dataframe = pd.concat([feat, labels], axis=1)

    dataframe.to_csv('cubeauth_{}.csv'.format(user))