Example #1
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier


def lazy_cls(X, y, output_csv=False):
    # Hold out 20% of the data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Fit every available classifier and collect the leaderboard
    clf = LazyClassifier()
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    if output_csv:
        models.to_csv('data/lazy_cls.csv')

    print(models)
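
# Minimal usage sketch (illustrative; scikit-learn's bundled breast-cancer
# dataset is an assumption, not the author's data):
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
lazy_cls(data.data, data.target)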

"""# **8. Apply Lazypredict**

Because the dataset is large, running the <code>LazyClassifier</code> algorithm may crash the session due to insufficient RAM. Google Colab provides 12 GB of RAM for free, but this algorithm needs more for a big dataset, so I'm using Google Colab Pro. Don't worry if you don't have Colab Pro; I'll provide the output in a CSV file.
"""

!pip install lazypredict==0.2.7

!pip install lightgbm

import lazypredict
from lazypredict.Supervised import LazyClassifier

# x_train/x_test/y_train/y_test come from an earlier split in the notebook
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)

models
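
# Persist the leaderboard so it can be reloaded without re-running the fit
# (the filename matches the one read back in the fallback cell below):
models.to_csv("lazypredict_algo.csv")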

"""NOTE: If the execution got failed or crashed then run the below cell to see the output of upper cells and make comment the above five cells"""

#  all_algorithm_df=pd.read_csv("lazypredict_algo.csv")
#  all_algorithm_df

"""# **9. Hyperparameter Tuning**"""

# Helper to report how much time is required to train a model
from datetime import datetime

def timer(start_time=None):
  if not start_time:
    return datetime.now()
  hours, rem = divmod((datetime.now() - start_time).total_seconds(), 3600)
  mins, secs = divmod(rem, 60)
  print('Time taken: %i h %i min %.2f s' % (hours, mins, secs))
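
# Illustrative timing sketch (hypothetical estimator; any model works here):
# start_time = timer()          # first call stores the start time
# model.fit(x_train, y_train)   # ...train something...
# timer(start_time)             # second call prints the elapsed time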
Example #3
import numpy as np
import lazypredict
import joblib
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_files, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

DATA_DIR = "/home/ila/Documents/repos/python-works/artificialintelligence/machine_learning/doc_classification/classifydata/dataset_5classes/"
# DATA_DIR = "/home/ila/Documents/900_docs/ocr_text/"

# data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=123)
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
models
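
# The unused imports above (Pipeline, TfidfVectorizer, SVC, joblib) hint at
# the original document-classification workflow; a hedged sketch of one such
# model, for the load_files case with raw texts:
# pipe = Pipeline([("tfidf", TfidfVectorizer()), ("svc", SVC())])
# pipe.fit(train_texts, y_train)
# joblib.dump(pipe, "doc_classifier.joblib")  # persist the fitted pipeline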
Example #4
X_full = df.drop('good_cond', axis=1)
y_full = df['good_cond']


# Run the lazy classifier once to get the list of all models:

# In[ ]:


# Splitting
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.25)

# Using LazyClassifier for the cut dataset
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
modellist = list(models.index.values) # Get the list of the methods' names
models


# Perform replications with 75% of the data as the training set. The R2 scores of the models are recorded, and the models are ranked by score. These scores and ranks are then averaged.

# In[ ]:


Nrep = 1000 # Number of replications, the higher the better
r2score = np.zeros((len(modellist),Nrep)) # Initialize the r2score
position = np.zeros((len(modellist),Nrep)) # Initialize the position (rank)
for LOOP in range(0,Nrep):

    #Splitting
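    # The excerpt is truncated here; a minimal sketch of how the loop might
    # continue (an assumption based on the description above). The text says
    # "R2 scores", but LazyClassifier's leaderboard reports "Accuracy", so
    # that column is used as the score here.
    X_train, X_test, y_train, y_test = train_test_split(X_full, y_full,
                                                        test_size=0.25)
    models, _ = clf.fit(X_train, X_test, y_train, y_test)
    scores = models["Accuracy"].reindex(modellist)  # keep a fixed model order
    r2score[:, LOOP] = scores.values
    position[:, LOOP] = scores.rank(ascending=False).values  # rank per rep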
Example #5
import os

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import GroupKFold
from lazypredict.Supervised import LazyClassifier

# Project-specific helpers (get_predictions_csv, compute_checksum_v2, fs,
# get_holdout, get_test_dataset, get_x_y_for_stacking, alaska_weighted_auc,
# INPUT_IMAGE_KEY) are assumed to come from the surrounding repository.


def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions,
                                with_logits=True,
                                tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions,
                                     with_logits=True,
                                     tta_logits=True)
    print(x_test.shape)

    # Append the one-hot image-quality features to the stacked model outputs
    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # Group folds by image id so predictions for the same image never appear
    # in both the training and validation splits
    group_kfold = GroupKFold(n_splits=5)

    for fold_index, (train_index, valid_index) in enumerate(
            group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (x[train_index], x[valid_index],
                                              y[train_index], y[valid_index])

        clf = LazyClassifier(verbose=True,
                             ignore_warnings=False,
                             custom_metric=alaska_weighted_auc,
                             predictions=True)
        models, predictions = clf.fit(x_train, x_valid, y_train, y_valid)
        print(models)

        models.to_csv(
            os.path.join(output_dir,
                         f"lazypredict_models_{fold_index}_{checksum}.csv"))
        predictions.to_csv(
            os.path.join(output_dir,
                         f"lazypredict_preds_{fold_index}_{checksum}.csv"))
Example #6
import numpy as np
import pandas as pd

data = pd.read_csv(r'D:\Datasets\winequality-red.csv')
#print(data.head())

# Binarize the target: quality above the threshold counts as 1, else 0
threshold = 5
data['quality'] = np.where(data['quality'] > threshold, 1, 0)
#print(data.quality.value_counts())

x = data.drop('quality',axis=1)
y = data['quality']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=75)

from lazypredict.Supervised import LazyClassifier
lpc = LazyClassifier()
models, predictions = lpc.fit(x_train, x_test, y_train, y_test)

print(models)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)

y_pred = rfc.predict(x_test)

print("Without Hyperparamter Tuning :- ")

from sklearn import metrics
print("Accuracy Score :- ",metrics.accuracy_score(y_test,y_pred))
print("Confusion Matrix :- ",metrics.confusion_matrix(y_test,y_pred))
print("Classification Report :- ",metrics.classification_report(y_test,y_pred))
Example #7

# From a Streamlit app: st, plt, sns, train_test_split, LazyClassifier, the
# helpers filedownload/imagedownload, and the sidebar values split_size and
# seed_number are defined elsewhere in the surrounding app.
def build_model(df):
    l = len(df)

    #df = df.iloc[:100]
    X = df.iloc[:, :-1]  # Use all columns except the last as X
    Y = df.iloc[:, -1]  # Select the last column as Y

    st.markdown('**1.2. Dataset dimension**')
    st.write('X (Independent Axis)')
    st.info(X.shape)
    st.write('Y (Dependent Axis)')
    st.info(Y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable (first few are shown)')
    st.info(list(X.columns[:int(l / 5)]))
    st.write('Y variable')
    st.info(Y.name)

    # Build lazy model
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split_size, random_state=seed_number)
    clf = LazyClassifier(verbose=0,
                         ignore_warnings=False,
                         custom_metric=None)
    models_train, predictions_train = clf.fit(X_train, X_train,
                                              Y_train, Y_train)
    models_test, predictions_test = clf.fit(X_train, X_test, Y_train,
                                            Y_test)

    st.subheader('2. Model Performance Plot (Training Set)')

    st.write('Training set')
    st.write(predictions_train)
    st.markdown(filedownload(predictions_train, 'training.csv'),
                unsafe_allow_html=True)

    st.write('Test set')
    st.write(predictions_test)
    st.markdown(filedownload(predictions_test, 'test.csv'),
                unsafe_allow_html=True)

    st.subheader('3. Model Performance Plot (Test set)')

    with st.markdown('**Accuracy**'):
        # Tall plot: clamp negative accuracies to zero for display
        predictions_test["Accuracy"] = [
            0 if i < 0 else i for i in predictions_test["Accuracy"]
        ]
        plt.figure(figsize=(5, 12))
        sns.set_theme(style="darkgrid")
        ax1 = sns.barplot(y=predictions_test.index,
                          x="Accuracy",
                          data=predictions_test)
        ax1.set(xlim=(0, 1))
    st.markdown(imagedownload(plt, 'plot-r2-tall.pdf'),
                unsafe_allow_html=True)
    # Wide plot
    plt.figure(figsize=(12, 5))
    sns.set_theme(style="darkgrid")
    ax1 = sns.barplot(x=predictions_test.index,
                      y="Accuracy",
                      data=predictions_test)
    ax1.set(ylim=(0, 1))
    plt.xticks(rotation=90)
    st.pyplot(plt)
    st.markdown(imagedownload(plt, 'plot-r2-wide.pdf'),
                unsafe_allow_html=True)
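
# Hypothetical invocation from the surrounding app (names assumed):
# uploaded_file = st.file_uploader("Upload your input CSV file", type=["csv"])
# if uploaded_file is not None:
#     df = pd.read_csv(uploaded_file)
#     build_model(df)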
Example #8
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
message = data.data
related = data.target

# splitting for train and test
x_train, x_test, y_train, y_test = train_test_split(message,
                                                    related,
                                                    train_size=0.9)

# Using lazy predict
clf = LazyClassifier(classifiers='all')
model, predictions = clf.fit(x_train, x_test, y_train, y_test)
print(model)
"""

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

"""### LazyPredict Method"""

!pip install lazypredict

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import RandomizedSearchCV

print("\n\n Lazy Predicts on non-scaled data")
print("===================================== \n")

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models
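
# X_train_scaled / X_test_scaled are not defined in this excerpt; a minimal
# sketch, assuming standard scaling was applied to the same split:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)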

print("\n\n Lazy Predicts on scaled data")
print("===================================== \n")

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_scaled, X_test_scaled, y_train, y_test)
models

"""### Random Forest Classifier"""

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# Candidate tree counts for the randomized search
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
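
# The excerpt ends mid-setup; a minimal sketch of how the randomized search
# might continue (the grid values below are assumptions, not the author's):
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
search = RandomizedSearchCV(rfc, param_distributions=param_grid,
                            n_iter=10, cv=3, random_state=42)
search.fit(X_train, y_train)
print(search.best_params_)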