def lazy_cls(X, y, output_csv=False):
    # Hold out 20% of the data for evaluation, with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = LazyClassifier()
    # Fit every available classifier and collect the leaderboard DataFrame.
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    if output_csv:
        models.to_csv('data/lazy_cls.csv')
    print(models)
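# A minimal usage sketch for lazy_cls above (illustrative; load_breast_cancer is an
# assumed example dataset, not part of the original snippet). The imports also cover
# the names lazy_cls relies on.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

data = load_breast_cancer()
lazy_cls(data.data, data.target, output_csv=False)  # prints the model leaderboard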
print(x_test.shape, y_test.shape)

"""# **8. Apply Lazypredict**

Because the dataset is large, running the <code>LazyClassifier</code> algorithm may
crash due to insufficient RAM. Google Colab provides about 12 GB of RAM for free, but
running this algorithm on a big dataset needs more, so I'm using Google Colab Pro.
Don't worry if you don't have Colab Pro; I'll provide the output in a CSV file.
"""

!pip install lazypredict==0.2.7
!pip install lightgbm

import lazypredict
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# fit() returns the leaderboard DataFrame and the per-model predictions.
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
models

"""NOTE: If the execution failed or crashed, run the cell below to load the output of
the cells above from CSV, and comment out the five cells above."""

# all_algorithm_df = pd.read_csv("lazypredict_algo.csv")
# all_algorithm_df

"""# **9. Hyperparameter Tuning**"""

# This helper is used to measure how much time is required to train the model with
# different algorithms.
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
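# A hedged sketch of the elapsed-time counterpart to timer() above: call timer() once
# with no argument to get a start time, then pass that start time here to print the
# elapsed duration. This helper and its print format are assumptions, not from the source.
def print_elapsed(start_time):
    elapsed = datetime.now() - start_time
    hours, rem = divmod(elapsed.total_seconds(), 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"Time taken: {int(hours)}h {int(minutes)}m {seconds:.2f}s")

# Usage:
# start = timer()
# ... train a model ...
# print_elapsed(start)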
import numpy as np
import lazypredict
import joblib
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_files, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

DATA_DIR = "/home/ila/Documents/repos/python-works/artificialintelligence/machine_learning/doc_classification/classifydata/dataset_5classes/"
# DATA_DIR = "/home/ila/Documents/900_docs/ocr_text/"
# data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")

# Use the built-in breast-cancer dataset instead of the text corpus above.
data = load_breast_cancer()
X = data.data
y = data.target

# 50/50 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=123)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
models
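# joblib is imported above but never used in this snippet. A minimal, assumed sketch
# (not from the original) of persisting one fitted model with it; the choice of
# RandomForestClassifier and the file name are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier

best = RandomForestClassifier(random_state=123).fit(X_train, y_train)
joblib.dump(best, "best_model.joblib")        # serialize the fitted estimator to disk
restored = joblib.load("best_model.joblib")   # reload it later without retraining
print(accuracy_score(y_test, restored.predict(X_test)))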
X_full = df.drop('good_cond', axis=1)
y_full = df['good_cond']

# Perform lazy classifier once to get the list of all models:

# In[ ]:

# Splitting (X, y here is the cut dataset prepared earlier)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Using LazyClassifier for the cut dataset
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
modellist = list(models.index.values)  # Get the list of the methods' names
models

# Perform replications with 75% of the data as the training set. The R2 scores of the
# models are recorded, and the models are ranked according to those scores. The scores
# and ranks are then averaged over the replications.

# In[ ]:

Nrep = 1000  # Number of replications; the higher the better
r2score = np.zeros((len(modellist), Nrep))   # Initialize the r2score
position = np.zeros((len(modellist), Nrep))  # Initialize the position (rank)

for LOOP in range(0, Nrep):
    # Splitting
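    # The replication loop above is truncated in the source. A hedged sketch of how it
    # might continue, assumed from the surrounding prose (resplit, refit, record each
    # model's leading score and its rank within this replication); not from the source:
    # X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25)
    # models_rep, _ = clf.fit(X_tr, X_te, y_tr, y_te)
    # for i, name in enumerate(modellist):
    #     r2score[i, LOOP] = models_rep.loc[name].iloc[0]
    #     position[i, LOOP] = list(models_rep.index).index(name)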
def main():
    output_dir = os.path.dirname(__file__)
    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    for fold_index, (train_index, valid_index) in enumerate(
            group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )

        clf = LazyClassifier(
            verbose=True, ignore_warnings=False, custom_metric=alaska_weighted_auc, predictions=True
        )
        models, predictions = clf.fit(x_train, x_valid, y_train, y_valid)
        print(models)
        models.to_csv(
            os.path.join(output_dir, f"lazypredict_models_{fold_index}_{checksum}.csv"))
        predictions.to_csv(
            os.path.join(output_dir, f"lazypredict_preds_{fold_index}_{checksum}.csv"))
data = pd.read_csv(r'D:\Datasets\winequality-red.csv')
# print(data.head())

# Binarize the target: quality above the threshold is "good" (1), otherwise 0.
threshold = 5
data['quality'] = np.where(data['quality'] > threshold, 1, 0)
# print(data.quality.value_counts())

x = data.drop('quality', axis=1)
y = data['quality']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=75)

from lazypredict.Supervised import LazyClassifier
lpc = LazyClassifier()
models, predictions = lpc.fit(x_train, x_test, y_train, y_test)
print(models)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

print("Without Hyperparameter Tuning:")
from sklearn import metrics
print("Accuracy Score:", metrics.accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
print("Classification Report:\n", metrics.classification_report(y_test, y_pred))
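# A hedged counterpart to the untuned run above: a small GridSearchCV sweep over
# RandomForest hyperparameters. The grid values and search settings are illustrative
# assumptions, not from the original snippet.
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [100, 300], "max_depth": [None, 10, 20]}
search = GridSearchCV(RandomForestClassifier(random_state=75), param_grid, cv=5, n_jobs=-1)
search.fit(x_train, y_train)

print("With Hyperparameter Tuning:")
print("Best params:", search.best_params_)
print("Accuracy Score:", metrics.accuracy_score(y_test, search.predict(x_test)))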
def build_model(df):
    l = len(df)
    # df = df.iloc[:100]
    X = df.iloc[:, :-1]  # Use every column except the last as X
    Y = df.iloc[:, -1]   # Use the last column as Y

    st.markdown('**1.2. Dataset dimension**')
    st.write('X (Independent Axis)')
    st.info(X.shape)
    st.write('Y (Dependent Axis)')
    st.info(Y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variables (first few are shown)')
    st.info(list(X.columns[:int(l / 5)]))
    st.write('Y variable')
    st.info(Y.name)

    # Build the lazy model; fit twice, once evaluated on the training set and once
    # on the held-out test set.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split_size, random_state=seed_number)
    clf = LazyClassifier(verbose=0, ignore_warnings=False, custom_metric=None)
    models_train, predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
    models_test, predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

    st.subheader('2. Model Performance Table')
    st.write('Training set')
    st.write(predictions_train)
    st.markdown(filedownload(predictions_train, 'training.csv'), unsafe_allow_html=True)
    st.write('Test set')
    st.write(predictions_test)
    st.markdown(filedownload(predictions_test, 'test.csv'), unsafe_allow_html=True)

    st.subheader('3. Model Performance Plot (Test set)')
    with st.markdown('**Accuracy**'):
        # Tall plot: clip negative accuracies to zero for display
        predictions_test["Accuracy"] = [
            0 if i < 0 else i for i in predictions_test["Accuracy"]
        ]
        plt.figure(figsize=(5, 12))
        sns.set_theme(style="darkgrid")
        ax1 = sns.barplot(y=predictions_test.index, x="Accuracy", data=predictions_test)
        ax1.set(xlim=(0, 1))
    st.markdown(imagedownload(plt, 'plot-r2-tall.pdf'), unsafe_allow_html=True)
    # Wide plot
    plt.figure(figsize=(12, 5))
    sns.set_theme(style="darkgrid")
    ax1 = sns.barplot(x=predictions_test.index, y="Accuracy", data=predictions_test)
    ax1.set(ylim=(0, 1))
    plt.xticks(rotation=90)
    st.pyplot(plt)
    st.markdown(imagedownload(plt, 'plot-r2-wide.pdf'), unsafe_allow_html=True)
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target

# Split into train and test sets (90% train)
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

# Use lazy predict to benchmark all available classifiers
clf = LazyClassifier(classifiers='all')
model, predictions = clf.fit(x_train, x_test, y_train, y_test)
print(model)
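# Optional follow-up (assumed, not in the original snippet): the returned `model`
# object is a pandas DataFrame indexed by classifier name, so it can be sorted or
# sliced like any other DataFrame.
print(model.sort_values("Accuracy", ascending=False).head(5))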
""" from sklearn.metrics import accuracy_score, classification_report, confusion_matrix """### LazyPredict Method""" pip install lazypredict from lazypredict.Supervised import LazyClassifier from sklearn.model_selection import RandomizedSearchCV print("\n\n Lazy Predicts on non-scaled data") print("===================================== \n") clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None) models,predictions = clf.fit(X_train, X_test, y_train, y_test) models print("\n\n Lazy Predicts on scaled data") print("===================================== \n") clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None) models,predictions = clf.fit(X_train_scaled, X_test_scaled, y_train, y_test) models """### Random Forest Classifier""" from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier() n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]