class AllStateDataLoaderTest(unittest.TestCase):
    """Column checks on the datasets produced by ``AllStateDataLoader``."""

    def setUp(self):
        """Create a fresh data loader before each test."""
        self.data_loader = AllStateDataLoader()

    def testColumnsData2Train(self):
        """The "data 2" training set has ``real_A`` and none of the listed
        ``value_A_pt_*`` columns."""
        self.data_2_train = self.data_loader.get_data_2_train()
        columns = self.data_2_train.columns

        # assertIn / assertNotIn report the member and the container on
        # failure, unlike assertTrue / assertFalse on a pre-computed boolean.
        self.assertIn("real_A", columns)
        for unexpected in ("value_A_pt_2", "value_A_pt_2_0", "value_A_pt_3_0"):
            self.assertNotIn(unexpected, columns)
from AllStateDataLoader import AllStateDataLoader
from AllStatePredictor import AllStatePredictor
from sklearn import linear_model
from sklearn import grid_search
import numpy as np


def score(y_predict, y_real):
    """Return the fraction of entries of ``y_predict`` equal to ``y_real``.

    ``y_predict`` must expose ``.shape`` (its first dimension is the
    denominator); the comparison is element-wise.
    """
    total = float(y_predict.shape[0])
    matches = float(np.sum(y_predict == y_real))
    return matches / total


l = AllStateDataLoader()
p = AllStatePredictor()

# Dataset "2": true ABCDEFG labels and cascade predictions on the train set.
# X_2 = l.get_X_train("2", "")
y_2 = l.get_y("2", "ABCDEFG")
y_2_predict = p.predict_cascade("2", "extratrees", "ABCDEFG", kind="train")

# Dataset "3".
# X_3 = l.get_X_train("3", "")
y_3 = l.get_y("3", "ABCDEFG")
y_3_predict = p.predict_cascade("3", "extratrees", "ABCDEFG", kind="train")

# Dataset "4".
# X_4 = l.get_X_train("4", "")
y_4 = l.get_y("4", "ABCDEFG")
y_4_predict = p.predict_cascade("4", "extratrees", "ABCDEFG", kind="train")

# X_all = l.get_X_train("all", "")
model = grid_search.GridSearchCV(log, parameters, verbose=verbose) model.fit(X, y) joblib.dump(model, filename) return model # fitting models parameters = { 'C': [0.1, 0.5, 1.0], 'loss': ['l2'], 'penalty': ['l1', 'l2'], 'dual': [False] } l = AllStateDataLoader() def get_model_filename(type_dataset, objective_letter, real_letters): if real_letters == "": return os.path.join( "model_linearsvc", "model_linearsvc_data_%s_%s_without_real_cascade.pkl" % (type_dataset, objective_letter)) else: return os.path.join( "model_linearsvc", "model_linearsvc_data_%s_%s_with_real_%s_cascade.pkl" % (type_dataset, objective_letter, real_letters))
return np.array(np.where(tmp["real_%s" % letter] == value, 1, 0)) def get_y(letter, data): tmp = data.copy() return np.array(tmp["real_%s" % letter]) from sklearn import linear_model from sklearn.externals import joblib from sklearn import grid_search l = AllStateDataLoader() print("Extraction data_2...") data_2 = l.get_data_2_train() print("Extraction data_3...") data_3 = l.get_data_3_train() print("Extraction data_all...") data_all = l.get_data_all_train() def fit_and_save_log(parameters, dataset, letter, filename, verbose=2): log = linear_model.LogisticRegression() X = get_X_without_scaler(dataset) y = get_y(letter, dataset) model = grid_search.GridSearchCV(log, parameters, verbose=verbose)
import sys sys.path.append("lib") from AllStateDataLoader import AllStateDataLoader from sklearn import linear_model from sklearn import grid_search import numpy as np l = AllStateDataLoader() # Model C sans rien X_all = l.get_X_train("all", "") y_all = l.get_y("all", "C") parameters = {'penalty' : ['l2'], 'C' : np.logspace(-3, 0, 3)} model_C = grid_search.GridSearchCV( linear_model.LogisticRegression(), parameters, verbose=2 ) model_D.fit(np.array(X_all), np.array(y_all)) # Model D sans rien X_all = l.get_X_train("all", "") y_all = l.get_y("all", "D") parameters = {'penalty' : ['l2'], 'C' : np.logspace(-3, 0, 3)} model_D = grid_search.GridSearchCV( linear_model.LogisticRegression(), parameters, verbose=2
import os
import sys

import numpy as np
import pandas as pd
from sklearn.externals import joblib

# The project modules live in ./lib; extend the path before importing them.
sys.path.append("lib")
from AllStateDataLoader import AllStateDataLoader

l = AllStateDataLoader()

data_train_all = l.get_data_all_train()
data_train_all_np = l.get_X_without_scaler(data_train_all)


def predict_AB(data, letter_1, letter_2):
    """Draw one class per row of ``data`` from the model's predicted
    probabilities.

    The model is loaded from
    ``model_logistic/model_logistic_data_all_<letter_1><letter_2>_not_centered.pkl``.
    For every row a uniform number in [0, 1) is drawn and located in the
    cumulative sum of ``predict_proba`` (inverse-CDF sampling), so the result
    is random unless NumPy's global RNG is seeded.

    Returns the sampled entries of ``model.best_estimator_.classes_``, one per
    row.
    """
    model_name = os.path.join(
        "model_logistic",
        "model_logistic_data_all_%s%s_not_centered.pkl" % (letter_1, letter_2))
    model = joblib.load(model_name)

    list_classes = model.best_estimator_.classes_
    prediction_cumsum = np.cumsum(model.predict_proba(data), axis=1)

    sampled = np.apply_along_axis(
        lambda row: np.searchsorted(row, np.random.uniform()),
        axis=1,
        arr=prediction_cumsum)
    # Rounding can leave the last cumulative value slightly below the draw,
    # in which case searchsorted returns len(row); clamp to the last class.
    prediction_classes = np.minimum(sampled, len(list_classes) - 1)

    return list_classes[prediction_classes]
tmp = data.copy() return np.array(np.where(tmp["real_%s" % letter] == value, 1, 0)) def get_y(letter, data): tmp = data.copy() return np.array(tmp["real_%s" % letter]) from sklearn import svm from sklearn.externals import joblib from sklearn import grid_search l = AllStateDataLoader() print("Extraction data_2...") data_2 = l.get_data_2_train(with_location_view=True) print("Extraction data_3...") data_3 = l.get_data_3_train(with_location_view=True) print("Extraction data_4...") data_4 = l.get_data_4_train(with_location_view=True) print("Extraction data_all...") data_all = l.get_data_all_train(with_location_view=True) def fit_and_save_log(parameters, dataset, letter, filename,verbose=2): log = svm.LinearSVC(class_weight="auto") X = get_X_without_scaler(dataset) y = get_y(letter, dataset)
def setUp(self):
    """Build a new ``AllStateDataLoader`` and expose it as ``self.data_loader``."""
    loader = AllStateDataLoader()
    self.data_loader = loader
def __init__(self):
    """Start with an empty dataset cache, a new data loader and debug enabled."""
    # Cache of already-loaded datasets; starts empty.
    self.__datasets = dict()
    # Loader used to fetch datasets.
    self.__dataloader = AllStateDataLoader()
    self.debug = True
from AllStateDataLoader import AllStateDataLoader
from AllStatePredictor import AllStatePredictor
from sklearn import linear_model
from sklearn import grid_search
import numpy as np


def score(y_predict, y_real):
    """Return the fraction of entries of ``y_predict`` equal to ``y_real``.

    ``y_predict`` must expose ``.shape`` (its first dimension is the
    denominator); the comparison is element-wise.
    """
    total = float(y_predict.shape[0])
    matches = float(np.sum(y_predict == y_real))
    return matches / total


l = AllStateDataLoader()
p = AllStatePredictor()

# Disabled evaluation of the simple logistic models (kept for reference):
#
# X_2 = l.get_X_train("2", "")
# y_2 = l.get_y("2", "ABCDEFG")
# y_2_predict = p.predict_simple("2", "logistic", "ABCDEFG", kind="train")
#
# X_3 = l.get_X_train("3", "")
# y_3 = l.get_y("3", "ABCDEFG")
# y_3_predict = p.predict_simple("3", "logistic", "ABCDEFG", kind="train")
#
# X_all = l.get_X_train("all", "")
# y_all = l.get_y("all", "ABCDEFG")
# y_all_predict = p.predict_simple("all", "logistic", "ABCDEFG", kind="train")
#
# print "score 2 logistic : %.4f" % (score(y_2, y_2_predict))
class AllStatePredictor():
    """Prediction helper: lazily loads datasets and applies pickled models.

    Datasets come from an ``AllStateDataLoader`` and are cached per
    (dataset type, kind), so test and train data never shadow each other.
    Models are read from disk with ``joblib.load``.
    """

    # Dataset identifiers for which the loader methods are called.
    _DATASET_TYPES = ("2", "3", "4", "all")
    # Target letters, in the order used to build the "ABCDEFG" string.
    _LETTERS = ("A", "B", "C", "D", "E", "F", "G")
    # Cascade order: (letter to predict, previously predicted letters that are
    # fed to the model as dummy columns).
    _CASCADE_STEPS = (
        ("D", ""),
        ("C", "D"),
        ("E", ""),
        ("B", "E"),
        ("F", "E"),
        ("A", "EF"),
        ("G", "A"),
    )

    def __init__(self):
        self.__datasets = {}
        self.__dataloader = AllStateDataLoader()
        self.debug = True

    @staticmethod
    def _concat_ABCDEFG(x):
        """Format the seven ``real_*`` values of row ``x`` as one digit string."""
        return "%d%d%d%d%d%d%d" % (x['real_A'], x['real_B'], x['real_C'],
                                   x['real_D'], x['real_E'], x['real_F'],
                                   x['real_G'])

    def __get_dataset(self, type_dataset, kind="test"):
        """Return the requested dataset, loading it on first use (lazy).

        ``type_dataset`` is one of "2", "3", "4", "all". ``kind == "test"``
        loads the test data; any other value loads the training features via
        ``get_X_train(type_dataset, "")``.

        Raises ``ValueError`` for an unknown ``type_dataset``.
        """
        if type_dataset not in self._DATASET_TYPES:
            raise ValueError("unknown dataset type: %r" % (type_dataset,))

        # Key the cache on the kind as well, otherwise whichever of test/train
        # was loaded first would be returned for both.
        kind_key = "test" if kind == "test" else "train"
        cache_key = (type_dataset, kind_key)
        if cache_key not in self.__datasets:
            if kind_key == "test":
                loader = getattr(self.__dataloader,
                                 "get_data_%s_test" % type_dataset)
                self.__datasets[cache_key] = loader()
            else:
                self.__datasets[cache_key] = self.__dataloader.get_X_train(
                    type_dataset, "")
        return self.__datasets[cache_key]

    def get_X_columns(self, type_dataset, kind="test"):
        """Return the dataset's column names, excluding the ``real_*`` targets."""
        dataset = self.__get_dataset(type_dataset, kind=kind)
        target_columns = set("real_%s" % letter for letter in self._LETTERS)
        return [x for x in dataset.columns if x not in target_columns]

    def get_X(self, type_dataset, kind="test"):
        """Return a copy of the dataset without ``real_*`` columns, with the
        remaining columns sorted by name."""
        dataset = self.__get_dataset(type_dataset, kind=kind)
        tmp = dataset.copy()
        for letter in self._LETTERS:
            variable = "real_%s" % letter
            # Test the frame itself: the filtered column list never contains
            # the target columns, so checking it would drop nothing.
            if variable in tmp.columns:
                del tmp[variable]
        return tmp.reindex(columns=sorted(tmp.columns))

    def get_customer_ID_list(self, type_dataset, kind="test"):
        """Return the dataset index as a NumPy array.

        NOTE(review): the index presumably holds the customer IDs -- confirm
        against the data loader.
        """
        dataset = self.__get_dataset(type_dataset, kind=kind)
        return np.array(dataset.index)

    def get_model_filename(self, letter, type_prediction, centered_or_not,
                           type_dataset):
        """Return the path of the pickled model file for the given settings."""
        return os.path.join(
            "model_%s" % type_prediction,
            "model_%s_data_%s_%s_%s.pkl" %
            (type_prediction, type_dataset, letter, centered_or_not))

    def __get_model(self, letter, type_prediction, centered_or_not,
                    type_dataset):
        """Load the model stored at ``get_model_filename(...)``."""
        filename = self.get_model_filename(letter, type_prediction,
                                           centered_or_not, type_dataset)
        return joblib.load(filename)

    def predict(self, letter, type_prediction, centered_or_not, type_dataset):
        """Predict on the test features of ``type_dataset``.

        Returns a tuple ``(predictions, probabilities, model)``.
        """
        X = self.get_X(type_dataset)
        model = self.__get_model(letter, type_prediction, centered_or_not,
                                 type_dataset)
        return (model.predict(X), model.predict_proba(X), model)

    def get_model(self, letter, type_prediction, centered_or_not,
                  type_dataset):
        """Return the loaded model for the given settings."""
        return self.__get_model(letter, type_prediction, centered_or_not,
                                type_dataset)

    def predict_simple(self, type_data, type_model, letter, kind="test"):
        """Predict with the independent "not_centered" models.

        With ``letter == "ABCDEFG"`` every letter is predicted by its own
        model and the result is one seven-digit string per row; otherwise the
        raw ``model.predict`` output for the single ``letter`` is returned.
        """
        data = self.__get_dataset(type_data, kind=kind)

        if letter != "ABCDEFG":
            model = joblib.load(self.get_model_filename(
                letter, type_model, "not_centered", type_data))
            return model.predict(data)

        tmp = data.copy()
        for letter_unique in letter:
            model = joblib.load(self.get_model_filename(
                letter_unique, type_model, "not_centered", type_data))
            # Always predict from the untouched data, not from tmp, so earlier
            # predictions never become features of later models.
            tmp["real_%s" % letter_unique] = model.predict(data)
        return tmp.apply(self._concat_ABCDEFG, axis=1)

    def __get_cascade_model_filename(self, type_dataset, objective_letter,
                                     real_letters):
        """Return the path of the cascade model predicting ``objective_letter``,
        optionally conditioned on ``real_letters``."""
        if real_letters == "":
            basename = ("model_linearsvc_data_%s_%s_without_real_cascade.pkl" %
                        (type_dataset, objective_letter))
        else:
            basename = ("model_linearsvc_data_%s_%s_with_real_%s_cascade.pkl" %
                        (type_dataset, objective_letter, real_letters))
        return os.path.join("model_linearsvc", basename)

    def __with_predicted_dummies(self, data, predictions, letters):
        """Return a copy of ``data`` extended with dummy columns built from
        ``predictions["real_<letter>"]`` for each letter, columns sorted."""
        tmp = data.copy()
        if not letters:
            return tmp
        for letter in letters:
            column = "real_%s" % letter
            tmp[column] = predictions[column].copy()
            tmp_bis = pd.DataFrame(pd.get_dummies(tmp[column], prefix=column),
                                   index=tmp.index)
            tmp = pd.merge(tmp, tmp_bis, left_index=True, right_index=True)
            del tmp[column]
        return tmp.reindex(columns=sorted(tmp.columns))

    def predict_cascade(self, type_data, type_model, letter, kind="test"):
        """Predict all seven letters with the cascade of linear-SVC models.

        Order: D, C (given D), E, B (given E), F (given E), A (given E and F),
        G (given A). Returns one seven-digit "ABCDEFG" string per row.

        ``type_model`` and ``letter`` are accepted but not used: the model
        files are always read from ``model_linearsvc``.
        """
        data = self.get_X(type_data, kind=kind)
        tmp_final = data.copy()

        for objective_letter, real_letters in self._CASCADE_STEPS:
            features = self.__with_predicted_dummies(data, tmp_final,
                                                     real_letters)
            model = joblib.load(self.__get_cascade_model_filename(
                type_data, objective_letter, real_letters))
            tmp_final["real_%s" % objective_letter] = model.predict(features)

        return tmp_final.apply(self._concat_ABCDEFG, axis=1)
import sys

# The project modules live in ./lib; extend the path before importing them.
sys.path.append("lib")
from AllStateDataLoader import AllStateDataLoader
from sklearn import linear_model
from sklearn import grid_search
import numpy as np

l = AllStateDataLoader()

# Model C without extra information
X_all = l.get_X_train("all", "")
y_all = l.get_y("all", "C")
parameters = {'penalty': ['l2'], 'C': np.logspace(-3, 0, 3)}
model_C = grid_search.GridSearchCV(linear_model.LogisticRegression(),
                                   parameters,
                                   verbose=2)
# Fit the model that was just built: the original called model_D.fit here,
# before model_D existed, which raised NameError and left model_C unfitted.
model_C.fit(np.array(X_all), np.array(y_all))

# Model D without extra information
X_all = l.get_X_train("all", "")
y_all = l.get_y("all", "D")
parameters = {'penalty': ['l2'], 'C': np.logspace(-3, 0, 3)}
model_D = grid_search.GridSearchCV(linear_model.LogisticRegression(),
                                   parameters,
                                   verbose=2)
model_D.fit(np.array(X_all), np.array(y_all))

# Model C with D
tmp = data.copy() return np.array(np.where(tmp["real_%s" % letter] == value, 1, 0)) def get_y(letter, data): tmp = data.copy() return np.array(tmp["real_%s" % letter]) from sklearn import svm from sklearn.externals import joblib from sklearn import grid_search l = AllStateDataLoader() print("Extraction data_2...") data_2 = l.get_data_2_train() print("Extraction data_3...") data_3 = l.get_data_3_train() print("Extraction data_4...") data_4 = l.get_data_4_train() print("Extraction data_all...") data_all = l.get_data_all_train() def fit_and_save_log(parameters, dataset, letter, filename,verbose=2): log = svm.LinearSVC(class_weight="auto") X = get_X_without_scaler(dataset) y = get_y(letter, dataset)
from sklearn import grid_search


def fit_and_save_log(parameters, X, y, filename, verbose=2):
    """Grid-search a ``LinearSVC`` over ``parameters``, fit it on ``X``/``y``,
    dump the fitted search object to ``filename`` and return it."""
    search = grid_search.GridSearchCV(svm.LinearSVC(class_weight="auto"),
                                      parameters,
                                      verbose=verbose)
    search.fit(X, y)
    joblib.dump(search, filename)
    return search


# fitting models
parameters = {
    'C': [0.1, 0.5, 1.0],
    'loss': ['l2'],
    'penalty': ['l1', 'l2'],
    'dual': [False],
}

l = AllStateDataLoader()


def get_model_filename(type_dataset, objective_letter, real_letters):
    """Return the path of the "with_location_view" cascade model file.

    An empty ``real_letters`` selects the "without_real" variant; otherwise
    the letters are embedded in the "with_real_<letters>" variant.
    """
    if real_letters == "":
        basename = ("model_linearsvc_data_%s_%s_without_real_cascade_with_location_view.pkl"
                    % (type_dataset, objective_letter))
    else:
        basename = ("model_linearsvc_data_%s_%s_with_real_%s_cascade_with_location_view.pkl"
                    % (type_dataset, objective_letter, real_letters))
    return os.path.join("model_linearsvc", basename)


for datasetname in ["2", "3", "4", "all"]:
    # Model D without extra information; only computed when its file is missing
    model_filename = get_model_filename(datasetname, "D", "")
    if not os.path.exists(model_filename):
        print("Calcul model %s sur dataset %s (%s)" % ("D", datasetname, model_filename))
        X = l.get_X_train(datasetname, "")
        y = l.get_y(datasetname, "D")
class AllStatePredictor():
    """Prediction helper: lazily loads datasets and applies pickled models.

    Datasets come from an ``AllStateDataLoader`` and are cached per
    (dataset type, kind), so test and train data never shadow each other.
    Models are read from disk with ``joblib.load``.
    """

    # Dataset identifiers for which the loader methods are called.
    _DATASET_TYPES = ("2", "3", "4", "all")
    # Target letters, in the order used to build the "ABCDEFG" string.
    _LETTERS = ("A", "B", "C", "D", "E", "F", "G")
    # Cascade order: (letter to predict, previously predicted letters that are
    # fed to the model as dummy columns).
    _CASCADE_STEPS = (
        ("D", ""),
        ("C", "D"),
        ("E", ""),
        ("B", "E"),
        ("F", "E"),
        ("A", "EF"),
        ("G", "A"),
    )

    def __init__(self):
        self.__datasets = {}
        self.__dataloader = AllStateDataLoader()
        self.debug = True

    @staticmethod
    def _concat_ABCDEFG(x):
        """Format the seven ``real_*`` values of row ``x`` as one digit string."""
        return "%d%d%d%d%d%d%d" % (x['real_A'], x['real_B'], x['real_C'],
                                   x['real_D'], x['real_E'], x['real_F'],
                                   x['real_G'])

    def __get_dataset(self, type_dataset, kind="test"):
        """Return the requested dataset, loading it on first use (lazy).

        ``type_dataset`` is one of "2", "3", "4", "all". ``kind == "test"``
        loads the test data; any other value loads the training features via
        ``get_X_train(type_dataset, "")``.

        Raises ``ValueError`` for an unknown ``type_dataset``.
        """
        if type_dataset not in self._DATASET_TYPES:
            raise ValueError("unknown dataset type: %r" % (type_dataset,))

        # Key the cache on the kind as well, otherwise whichever of test/train
        # was loaded first would be returned for both.
        kind_key = "test" if kind == "test" else "train"
        cache_key = (type_dataset, kind_key)
        if cache_key not in self.__datasets:
            if kind_key == "test":
                loader = getattr(self.__dataloader,
                                 "get_data_%s_test" % type_dataset)
                self.__datasets[cache_key] = loader()
            else:
                self.__datasets[cache_key] = self.__dataloader.get_X_train(
                    type_dataset, "")
        return self.__datasets[cache_key]

    def get_X_columns(self, type_dataset, kind="test"):
        """Return the dataset's column names, excluding the ``real_*`` targets."""
        dataset = self.__get_dataset(type_dataset, kind=kind)
        target_columns = set("real_%s" % letter for letter in self._LETTERS)
        return [x for x in dataset.columns if x not in target_columns]

    def get_X(self, type_dataset, kind="test"):
        """Return a copy of the dataset without ``real_*`` columns, with the
        remaining columns sorted by name."""
        dataset = self.__get_dataset(type_dataset, kind=kind)
        tmp = dataset.copy()
        for letter in self._LETTERS:
            variable = "real_%s" % letter
            # Test the frame itself: the filtered column list never contains
            # the target columns, so checking it would drop nothing.
            if variable in tmp.columns:
                del tmp[variable]
        return tmp.reindex(columns=sorted(tmp.columns))

    def get_customer_ID_list(self, type_dataset, kind="test"):
        """Return the dataset index as a NumPy array.

        NOTE(review): the index presumably holds the customer IDs -- confirm
        against the data loader.
        """
        dataset = self.__get_dataset(type_dataset, kind=kind)
        return np.array(dataset.index)

    def get_model_filename(self, letter, type_prediction, centered_or_not,
                           type_dataset):
        """Return the path of the pickled model file for the given settings."""
        return os.path.join(
            "model_%s" % type_prediction,
            "model_%s_data_%s_%s_%s.pkl" %
            (type_prediction, type_dataset, letter, centered_or_not))

    def __get_model(self, letter, type_prediction, centered_or_not,
                    type_dataset):
        """Load the model stored at ``get_model_filename(...)``."""
        filename = self.get_model_filename(letter, type_prediction,
                                           centered_or_not, type_dataset)
        return joblib.load(filename)

    def predict(self, letter, type_prediction, centered_or_not, type_dataset):
        """Predict on the test features of ``type_dataset``.

        Returns a tuple ``(predictions, probabilities, model)``.
        """
        X = self.get_X(type_dataset)
        model = self.__get_model(letter, type_prediction, centered_or_not,
                                 type_dataset)
        return (model.predict(X), model.predict_proba(X), model)

    def get_model(self, letter, type_prediction, centered_or_not,
                  type_dataset):
        """Return the loaded model for the given settings."""
        return self.__get_model(letter, type_prediction, centered_or_not,
                                type_dataset)

    def predict_simple(self, type_data, type_model, letter, kind="test"):
        """Predict with the independent "not_centered" models.

        With ``letter == "ABCDEFG"`` every letter is predicted by its own
        model and the result is one seven-digit string per row; otherwise the
        raw ``model.predict`` output for the single ``letter`` is returned.
        """
        data = self.__get_dataset(type_data, kind=kind)

        if letter != "ABCDEFG":
            model = joblib.load(self.get_model_filename(
                letter, type_model, "not_centered", type_data))
            return model.predict(data)

        tmp = data.copy()
        for letter_unique in letter:
            model = joblib.load(self.get_model_filename(
                letter_unique, type_model, "not_centered", type_data))
            # Always predict from the untouched data, not from tmp, so earlier
            # predictions never become features of later models.
            tmp["real_%s" % letter_unique] = model.predict(data)
        return tmp.apply(self._concat_ABCDEFG, axis=1)

    def __get_cascade_model_filename(self, type_dataset, objective_letter,
                                     real_letters):
        """Return the path of the cascade model predicting ``objective_letter``,
        optionally conditioned on ``real_letters``."""
        if real_letters == "":
            basename = ("model_linearsvc_data_%s_%s_without_real_cascade.pkl" %
                        (type_dataset, objective_letter))
        else:
            basename = ("model_linearsvc_data_%s_%s_with_real_%s_cascade.pkl" %
                        (type_dataset, objective_letter, real_letters))
        return os.path.join("model_linearsvc", basename)

    def __with_predicted_dummies(self, data, predictions, letters):
        """Return a copy of ``data`` extended with dummy columns built from
        ``predictions["real_<letter>"]`` for each letter, columns sorted."""
        tmp = data.copy()
        if not letters:
            return tmp
        for letter in letters:
            column = "real_%s" % letter
            tmp[column] = predictions[column].copy()
            tmp_bis = pd.DataFrame(pd.get_dummies(tmp[column], prefix=column),
                                   index=tmp.index)
            tmp = pd.merge(tmp, tmp_bis, left_index=True, right_index=True)
            del tmp[column]
        return tmp.reindex(columns=sorted(tmp.columns))

    def predict_cascade(self, type_data, type_model, letter, kind="test"):
        """Predict all seven letters with the cascade of linear-SVC models.

        Order: D, C (given D), E, B (given E), F (given E), A (given E and F),
        G (given A). Returns one seven-digit "ABCDEFG" string per row.

        ``type_model`` and ``letter`` are accepted but not used: the model
        files are always read from ``model_linearsvc``.
        """
        data = self.get_X(type_data, kind=kind)
        tmp_final = data.copy()

        for objective_letter, real_letters in self._CASCADE_STEPS:
            features = self.__with_predicted_dummies(data, tmp_final,
                                                     real_letters)
            model = joblib.load(self.__get_cascade_model_filename(
                type_data, objective_letter, real_letters))
            tmp_final["real_%s" % objective_letter] = model.predict(features)

        return tmp_final.apply(self._concat_ABCDEFG, axis=1)