Exemple #1
0
def train(df, fit_file):
    """Train an auto-sklearn text classifier on ``df`` and persist it.

    The ``nome_desc`` column is turned into word counts and the
    ``categoria`` column is label-encoded. 75% of the rows are used for
    fitting; the accuracy on the remaining rows is printed. The split is
    not seeded, so the reported accuracy varies between calls.

    Three joblib artifacts are written: the classifier at ``fit_file``,
    and the fitted encoder and vectorizer in the same directory as
    ``encoder_<name>`` and ``vectorizer_<name>``, where ``<name>`` is the
    base name of ``fit_file``.

    Args:
        df: Table exposing ``nome_desc`` and ``categoria`` attributes
            (presumably a pandas DataFrame -- confirm against the caller).
        fit_file: Path the fitted classifier is dumped to.
    """
    import os

    print("Training...")
    train_size = 0.75
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None)
    clf = automl.AutoSklearnClassifier(
        include_preprocessors=["no_preprocessing"],
        exclude_preprocessors=None)
    encoder = LabelEncoder()

    y = encoder.fit_transform(df.categoria)
    X = vectorizer.fit_transform(df.nome_desc)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    msg = "Accuracy with {:.0%} of testing data: {:.1%}".format(
        1 - train_size, accuracy)
    print(msg)

    # Prefix only the file name, not the whole path: 'encoder_%s' % fit_file
    # would turn "models/clf.pkl" into "encoder_models/clf.pkl". For a bare
    # file name the resulting paths are unchanged.
    directory, filename = os.path.split(fit_file)
    joblib.dump(clf, fit_file)
    joblib.dump(encoder, os.path.join(directory, 'encoder_%s' % filename))
    joblib.dump(vectorizer,
                os.path.join(directory, 'vectorizer_%s' % filename))
Exemple #2
0
def train_autosklearn(alldata, labels, mtype, jsonfile, problemtype, default_features):
    """Fit an auto-sklearn classifier and store it with a JSON summary.

    The data is split 75/25 (shuffled, ``random_state=42``), the classifier
    is fitted on the training part and its accuracy on the test part is
    printed. The pickled model and a JSON description are written to the
    current directory and then moved into ``<problemtype>_models``, which
    is created if needed.

    Side effect: the process working directory is left inside
    ``<problemtype>_models`` when the function returns.

    Args:
        alldata: Feature rows; ``alldata[0]`` must be sized, because every
            column is declared 'numerical' based on the first row's length.
        labels: Target labels aligned with ``alldata``.
        mtype: Not used by this function.
        jsonfile: Name of the source file; its last five characters are
            dropped to form the dataset and folder names (presumably a
            ".json" suffix -- confirm against the caller).
        problemtype: Stored under 'sample type' and used as the prefix of
            the model directory.
        default_features: Feature-set label, used in the folder name and
            stored under 'feature_set'.
    """
    # NOTE(review): `modelname` and `classifiername` are not defined in this
    # function and are not parameters; they are looked up as module globals.
    # Confirm they exist (possibly `modelname` was meant to be `foldername`).
    dataset_name = jsonfile[0:-5]
    foldername = dataset_name + '_autosklearn_%s' % (default_features)

    X_train, X_test, y_train, y_test = train_test_split(alldata,
                                                        labels,
                                                        train_size=0.750,
                                                        test_size=0.250,
                                                        random_state=42,
                                                        shuffle=True)
    feature_types = ['numerical'] * len(X_train[0])

    # NOTE(review): per_run_time_limit (300) is larger than the overall
    # time_left_for_this_task (60) -- confirm these budgets are intended.
    automl = asklc.AutoSklearnClassifier(
        time_left_for_this_task=60,
        per_run_time_limit=300,
        ml_memory_limit=10240,
        tmp_folder=os.getcwd() + '/' + foldername + '_tmp',
        output_folder=os.getcwd() + '/' + foldername,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False)

    automl.fit(X_train,
               y_train,
               dataset_name=dataset_name,
               feat_type=feature_types)

    y_predictions = automl.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_true=y_test,
                                         y_pred=y_predictions)
    print("Accuracy:", acc)

    print('saving classifier to disk')
    # Context managers close the files even if serialization raises.
    with open(modelname + '.pickle', 'wb') as model_file:
        pickle.dump(automl, model_file)

    data = {'sample type': problemtype,
            'training script': 'autosklearn',
            'feature_set': default_features,
            'model name': modelname + '.pickle',
            'accuracy': acc,
            'model type': 'sc_' + classifiername,
            }

    with open(modelname + '.json', 'w') as summary_file:
        json.dump(data, summary_file)

    cur_dir2 = os.getcwd()
    models_dir = problemtype + '_models'
    # Create the target directory only when it is missing, instead of using
    # a bare except around chdir that would hide any unrelated failure.
    os.makedirs(models_dir, exist_ok=True)
    os.chdir(models_dir)

    # now move all the files over to proper model directory
    shutil.move(cur_dir2 + '/' + modelname + '.json', os.getcwd() + '/' + modelname + '.json')
    shutil.move(cur_dir2 + '/' + modelname + '.pickle', os.getcwd() + '/' + modelname + '.pickle')
Exemple #3
0
    def execute(self, params, **kwargs):
        """Fit an auto-sklearn classifier and publish it as the model.

        Trains on the "X_train" / "y_train" entries of
        ``self.marvin_dataset`` with preprocessing restricted to
        "no_preprocessing", then sets ``self.marvin_model`` to a dict
        holding the fitted classifier ("clf") together with the "vect" and
        "encoder" objects taken from the dataset.

        ``params`` and ``kwargs`` are accepted but not used here.
        """
        import autosklearn.classification as automl

        search_options = {
            "include_preprocessors": ["no_preprocessing"],
            "exclude_preprocessors": None,
        }
        classifier = automl.AutoSklearnClassifier(**search_options)
        classifier.fit(
            self.marvin_dataset["X_train"],
            self.marvin_dataset["y_train"],
        )

        # Bundle everything needed at prediction time alongside the model.
        self.marvin_model = dict(
            clf=classifier,
            vect=self.marvin_dataset["vect"],
            encoder=self.marvin_dataset["encoder"],
        )
Exemple #4
0
 # Build `model` for the algorithm selected by `alg.name`.
 # (Excerpt of a longer if/elif chain; the final branch continues outside
 # this excerpt.)
 if alg.name == 'TPOT_Classifier':
     # TPOT: search settings are taken directly from `alg`.
     from tpot import TPOTClassifier
     model = TPOTClassifier(
         generations=alg.generations,
         cv=alg.cv,
         scoring=alg.scoring,
         verbosity=alg.verbosity
     )
     # Presumably reports that this algorithm has no GPU implementation --
     # confirm against the helper's definition.
     warn_not_gpu_support(alg)
 elif alg.name == 'AutoSklearn_Classifier':
     from autosklearn import classification
     # Resampling strategy and fold count are passed only when
     # `alg.sampling` is truthy; otherwise only the time budgets are set.
     if alg.sampling:
         model = classification.AutoSklearnClassifier(
             time_left_for_this_task=alg.task_time,
             per_run_time_limit=alg.run_time,
             resampling_strategy=alg.sampling_strategy,
             resampling_strategy_arguments={'folds': alg.folds}
         )
     else:
         model = classification.AutoSklearnClassifier(
             time_left_for_this_task=alg.task_time,
             per_run_time_limit=alg.run_time
         )
     warn_not_gpu_support(alg)
 elif alg.name == 'SupportVectorMachines':
     # Use the cuML SVC when NVIDIA_RAPIDS_ENABLED is set, otherwise the
     # scikit-learn one; both are constructed the same way below.
     if NVIDIA_RAPIDS_ENABLED:
         from cuml.svm import SVC
     else:
         from sklearn.svm import SVC
     # Every attribute of `alg.input_variables` becomes a keyword argument.
     model = SVC(**alg.input_variables.__dict__)
 elif alg.name == 'GaussianNaiveBayes':
 # Classification algorithms
 #
 # Build `model` for the algorithm selected by `alg.name`.
 # (Excerpt of a longer if/elif chain; the final branch continues outside
 # this excerpt.)
 if alg.name == 'TPOT_Classifier':
   # TPOT: search settings are taken directly from `alg`.
   from tpot import TPOTClassifier
   model = TPOTClassifier(        
       generations=alg.generations,
       cv=alg.cv,
       scoring=alg.scoring,
       verbosity=alg.verbosity)
 elif alg.name == 'AutoSklearn_Classifier':
   from autosklearn import classification
   # `alg.sampling` is treated as a string flag here and compared
   # case-insensitively against 'true'.
   if alg.sampling.lower()=='true':
     # The strategy is joined into one string and the fold count cast to
     # int -- presumably both arrive as text/sequence values; TODO confirm.
     model = classification.AutoSklearnClassifier(
         time_left_for_this_task=alg.task_time,
         per_run_time_limit=alg.run_time,
         resampling_strategy= "".join(alg.sampling_strategy),
         resampling_strategy_arguments={'folds':int(alg.folds)}
         #feat_type = {Numerical,Numerical,Numerical,Numerical,Categorical}
     )
   else:
       # Without sampling only the time budgets are configured.
       model = classification.AutoSklearnClassifier(
         time_left_for_this_task=alg.task_time,
         per_run_time_limit=alg.run_time
     )
 elif alg.name == 'SupportVectorMachines':
   from sklearn.svm import SVC
   # NOTE(review): `vars` must be a mapping bound outside this excerpt;
   # if it is not rebound, this is the builtin `vars` and `**vars` fails.
   model = SVC(**vars)   
 elif alg.name == 'GaussianNaiveBayes':
   from sklearn.naive_bayes import GaussianNB
   # Same `vars` mapping as for the SVC branch above.
   model = GaussianNB(**vars)  
 elif alg.name == 'LogisticRegression':
def main():
    """Run repeated auto-sklearn experiments on the Telco churn CSV.

    Prompts once for the training variant:

    1. fit on the training split as-is ("Imbalanced");
    2. fit on the training split resampled by ``smoteenn.run``
       ("SMOTE + ENN").

    An unknown option prints "Invalid Option" and exits before any data is
    loaded. Non-numeric input raises ``ValueError`` from ``int``.

    Each of the ``runs`` repetitions reloads and prunes the CSV, makes a
    fresh 70/30 train/test split, fits an auto-sklearn classifier with a
    Cohen's kappa scorer, passes the fitted model to ``scores.save_log``
    and ``scores.predict_and_save``, and pickles it to
    ``<run index><suffix>`` in the current directory.
    """
    time_limit = 60 * 5  # per-run limit; the overall budget adds 10 on top
    runs = 5

    option = int(input("1. Imbalanced\n2. SMOTE + ENN\n"))

    # option -> (log file, prediction file, pickled-model suffix).
    # The two variants differ only in these names and in the training data.
    outputs = {
        1: ("log_imbal.txt", "prediction.txt", "imbal_model.sav"),
        2: ("log_bal.txt", "prediction_balanced.txt", "bal_model.sav"),
    }
    if option not in outputs:
        # Reject a bad choice up front instead of after loading the data.
        print("Invalid Option")
        sys.exit()
    log_file, prediction_file, model_suffix = outputs[option]

    for j in range(runs):
        churn = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
        churn = ut.prune_data(churn)

        X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(
            churn.drop('Churn', axis=1), churn['Churn'], test_size=0.3)

        # Choosing kappa as the metric
        kappas = asc_t.metrics.make_scorer('kappa',
                                           sk.metrics.cohen_kappa_score)

        if option == 2:
            # Applying SMOTE + ENN to the training split only; the test
            # split is left untouched.
            X_fit, Y_fit = smoteenn.run(X_train,
                                        Y_train,
                                        smote_kind="svm")
        else:
            X_fit, Y_fit = X_train, Y_train

        automl = asc.AutoSklearnClassifier(
            time_left_for_this_task=time_limit + 10,
            per_run_time_limit=time_limit,
            initial_configurations_via_metalearning=0)

        automl.fit(X_fit, Y_fit, metric=kappas)

        # Getting the rank of models
        scores.save_log(automl, log_file)

        # Predicting the model
        scores.predict_and_save(automl,
                                X_test,
                                Y_test,
                                verbose=True,
                                file=prediction_file)

        # Save the model into binary code; the context manager guarantees
        # the file is flushed and closed.
        filename = str(j) + model_suffix
        with open(filename, 'wb') as model_file:
            pickle.dump(automl, model_file)