def NB(X, y, X_ind, y_ind):
    """5-fold cross-validation and independent-set test for Gaussian Naive Bayes.

    Arguments:
        X (ndarray): Feature data used for cross-validation; indexed by the
            row indices produced by the stratified splitter.
        y (ndarray): Labels matching the rows of X. Also defines the shape of
            the cross-validation output.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Labels of the independent test set. Only its shape is
            used here (to size the output array).

    Returns:
        cvs (ndarray): Out-of-fold probability of class column 1 for every
            sample in X (same shape as y).
        inds (ndarray): Probability of class column 1 for every sample in
            X_ind, averaged over the 5 fold models (same shape as y_ind).
    """
    n_folds = 5
    splitter = StratifiedKFold(n_folds)
    cv_scores = np.zeros(y.shape)
    ind_scores = np.zeros(y_ind.shape)

    for train_idx, valid_idx in splitter.split(X, y):
        classifier = GaussianNB()
        classifier.fit(X[train_idx], y[train_idx])
        # Out-of-fold prediction for the held-out rows of this fold.
        cv_scores[valid_idx] = classifier.predict_proba(X[valid_idx])[:, 1]
        # Accumulate; divided by the number of folds on return.
        ind_scores += classifier.predict_proba(X_ind)[:, 1]

    return cv_scores, ind_scores / n_folds
def evaluate(model, dataloader, device):
    """Run `model` over every batch of `dataloader` and compute summary metrics.

    The model is switched to eval mode and all predictions are made under
    `torch.no_grad()`.

    Arguments:
        model: object exposing `eval()` and `predict_proba(batch_inputs)`.
        dataloader: iterable yielding `(batch_inputs, batch_labels)` pairs.
        device: target passed to `.to(...)` for both inputs and labels.

    Returns:
        dict with keys "loss" (log loss on probability column 1), "accuracy"
        (labels vs. arg-max over columns) and "AUC" (ROC AUC on column 1).
    """
    model.eval()
    collected_probas, collected_labels = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            # presumably (batch_size, 2) softmax probabilities -- TODO confirm
            batch_probas = model.predict_proba(batch_inputs)
            collected_probas.append(to_np(batch_probas))
            collected_labels.append(to_np(batch_labels))

    probas = np.vstack(collected_probas)
    labels = np.concatenate(collected_labels)
    positive = probas[:, 1]

    return {
        "loss": log_loss(labels, positive),
        "accuracy": accuracy_score(labels, probas.argmax(1)),
        "AUC": roc_auc_score(labels, positive),
    }
def SVM(X, y, X_ind, y_ind, is_reg=False):
    """Cross validation and independent set test for Support Vector Machine (SVM).

    Hyper-parameters `C` and `gamma` are first chosen with a grid search on the
    whole of (X, y); a fresh model with those parameters is then fitted on
    each of the 5 folds.

    Arguments:
        X (ndarray): Feature data used for cross-validation; indexed by the
            row indices produced by the splitter.
        y (ndarray): Targets matching the rows of X. Also defines the shape of
            the cross-validation output.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Targets of the independent test set. Only its shape
            is used here (to size the output array).
        is_reg (bool, optional): fit a regressor (SVR, plain KFold) when True,
            a probabilistic classifier (SVC, StratifiedKFold) when False
            (Default: False)

    Returns:
        cvs (ndarray): Out-of-fold predictions for X (same shape as y):
            predicted values for regression, probability of class column 1
            for classification.
        inds (ndarray): Predictions for X_ind averaged over the 5 fold models
            (same shape as y_ind).
    """
    n_folds = 5
    if is_reg:
        folds = KFold(n_folds).split(X)
        alg, fixed_kwargs = SVR, {}
    else:
        folds = StratifiedKFold(n_folds).split(X, y)
        alg, fixed_kwargs = SVC, {'probability': True}

    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)

    # NOTE(review): each grid holds only two candidates (2**-5, 2**15 and
    # 2**-15, 2**5). If a full range of exponents was intended this should be
    # np.arange(...) -- left unchanged here.
    gs = GridSearchCV(alg(**fixed_kwargs), {
        'C': 2.0**np.array([-5, 15]),
        'gamma': 2.0**np.array([-15, 5])
    }, n_jobs=5)
    gs.fit(X, y)
    params = gs.best_params_
    print(params)

    for trained, valided in folds:
        # Build the same estimator family that was tuned above. Previously an
        # SVC was created unconditionally, so regression mode fitted a
        # classifier on continuous targets.
        model = alg(C=params['C'], gamma=params['gamma'], **fixed_kwargs)
        model.fit(X[trained], y[trained])
        if is_reg:
            cvs[valided] = model.predict(X[valided])
            inds += model.predict(X_ind)
        else:
            cvs[valided] = model.predict_proba(X[valided])[:, 1]
            inds += model.predict_proba(X_ind)[:, 1]
    return cvs, inds / n_folds
def task():
    """Run the hard-coded 2019 Q4 experiment and print cross-validation scores.

    Steps:
      1. Load the monthly feature pickles and stack them into one frame
         (missing values filled with 0).
      2. Load cached model probabilities / labels if the pickle files exist,
         otherwise compute them and write the cache.
      3. Cross-validate each configured classifier and print the config plus
         the average of every returned metric.

    Returns nothing; results are printed to stdout.
    """
    year, months = 2019, [10, 11, 12]
    select = None
    radius, aperture_size, incident_interval, time_step = '100', '6', '25', '1'
    config = {
        'year': year,
        'months': months,
        'radius': radius,
        'aperture_size': aperture_size,
        'incident_interval': incident_interval,
        'time_step': time_step,
        'select': select
    }

    # DataFrame.append was removed in pandas 2.0; concatenate all months in
    # one call instead (same row order, original indices kept).
    monthly_frames = [
        pd.read_pickle(
            f'output/waze/{year}_{month}_{radius}_{aperture_size}_features.pkl')
        for month in months
    ]
    df = pd.concat(monthly_frames, sort=False)
    df.fillna(0, inplace=True)

    # The cache files are keyed by the LAST month of the range, exactly as
    # before (the loop variable used to leak its final value).
    # NOTE(review): a cache written for another month list ending in the same
    # month would be reused silently -- confirm that is acceptable.
    month = months[-1]
    cache_prefix = f'output/{year}_{month}_{radius}_{aperture_size}'
    proba_path = f'{cache_prefix}_predict_proba.pkl'
    labels_path = f'{cache_prefix}_y.pkl'

    if os.path.exists(proba_path):
        x = pd.read_pickle(proba_path)
    else:
        x = model.predict_proba(df, int(incident_interval), time_step)
        x = model.extract_features(x, df)
        x.to_pickle(proba_path)

    incident_df = load_incidents(aperture_size)

    if os.path.exists(labels_path):
        y = pd.read_pickle(labels_path)
    else:
        y = label_mapper(x, incident_df)
        pd.Series(y).to_pickle(labels_path)

    models = [
        'LogisticRegression', 'DecisionTreeClassifier',
        'RandomForestClassifier'
    ]
    for m in models:
        print('model', m)
        cv_results, _y_pred = model.cross_validate(x, y, m)
        for k, v in config.items():
            print(k, ',', v)
        for k, v in cv_results.items():
            print(k, ',', np.average(v))
        print()
def score_and_predict(model, X, Y):
    '''
    Score a binary classifier on features `X` against labels `Y`.

    The probability in column 1 of `model.predict_proba(X)` is thresholded at
    0.5 (strictly greater) to obtain the predicted class, which is compared
    element-wise with `Y` after both are squeezed.

    Returns (accuracy, numpy array of column-1 probabilities).
    '''
    positive_probs = model.predict_proba(X)[:, 1]
    predicted = np.squeeze(positive_probs > .5)
    expected = np.squeeze(Y)
    hits = expected == predicted
    return hits.mean(), positive_probs
def find_text_in_frame(current_img, baseimgs, modelfile='webapp/model.pickle',proba_threshold = 0.5, debug=False):
    """Collect blobs from `current_img` that the model scores as text.

    For every base image, blobs are extracted from `current_img - baseimg`
    with the Otsu pipeline and scored via `model.predict_proba`. A blob is
    kept when its score is at least `proba_threshold`, or always in debug
    mode. Outside debug mode the search stops at the first base image that
    produced any kept blob.

    Returns a list of dicts with keys 'blob', 'left_corner' ([xmin, ymin])
    and 'proba'; the list is empty when nothing qualified.
    """
    found = []
    for baseimg in baseimgs:
        difference = current_img - baseimg
        extracted = img_proc_utils.extract_blobs(
            difference, img_proc_pipeline=img_proc_utils.pipeline_otsu)
        for (xmin, ymin), blob in extracted:
            proba = model.predict_proba(blob, model=modelfile)
            keep = proba >= proba_threshold or debug
            if keep:
                found.append({
                    'blob': blob,
                    'left_corner': [xmin, ymin],
                    'proba': proba,
                })
        # First base image with a hit wins unless we are debugging.
        if found and not debug:
            return found
    return found
def RF(X, y, X_ind, y_ind, is_reg=False):
    """5-fold cross-validation and independent-set test for a Random Forest.

    Arguments:
        X (ndarray): Feature data used for cross-validation; indexed by the
            row indices produced by the splitter.
        y (ndarray): Targets matching the rows of X. Also defines the shape of
            the cross-validation output.
        X_ind (ndarray): Feature data of the independent test set.
        y_ind (ndarray): Targets of the independent test set. Only its shape
            is used here (to size the output array).
        is_reg (bool, optional): use RandomForestRegressor with plain KFold
            when True, RandomForestClassifier with StratifiedKFold when False
            (Default: False)

    Returns:
        cvs (ndarray): Out-of-fold predictions for X (same shape as y):
            predicted values for regression, probability of class column 1
            for classification.
        inds (ndarray): Predictions for X_ind averaged over the 5 fold models
            (same shape as y_ind).
    """
    n_folds = 5
    if is_reg:
        splits = KFold(n_folds).split(X)
        estimator_cls = RandomForestRegressor
    else:
        splits = StratifiedKFold(n_folds).split(X, y)
        estimator_cls = RandomForestClassifier

    def _score(fitted, data):
        # Regression: raw prediction; classification: probability column 1.
        if is_reg:
            return fitted.predict(data)
        return fitted.predict_proba(data)[:, 1]

    cv_scores = np.zeros(y.shape)
    ind_scores = np.zeros(y_ind.shape)
    for train_idx, valid_idx in splits:
        forest = estimator_cls(n_estimators=500, n_jobs=1)
        forest.fit(X[train_idx], y[train_idx])
        cv_scores[valid_idx] = _score(forest, X[valid_idx])
        ind_scores += _score(forest, X_ind)
    return cv_scores, ind_scores / n_folds
def temporal(model, batch, y_preds, num_classes, device):
    """Per-step change in the probability of `y_preds` as the input prefix grows.

    For every step i the model is queried (without gradients) on the prefix
    `inputs[:i + 1]` with lengths clipped to at most i + 1, and the
    probability gathered at `y_preds` is recorded. The result at step 0 is
    that probability minus the uniform prior `1 / num_classes`; at later
    steps it is the difference to the previous step's probability.

    Arguments:
        model: object exposing `predict_proba((inputs, lengths))`.
        batch: `(inputs, lengths)` pair; the first axis of `inputs` is the one
            being truncated.
        y_preds: index tensor passed to `gather(1, ...)` on the model output.
        num_classes: number of classes, used for the uniform baseline.
        device: device for the scalar used to clip `lengths`.

    Returns:
        Tensor with the same shape as `inputs`.
    """
    inputs, lengths = batch
    n_steps = inputs.shape[0]
    prefix_probs = torch.zeros(inputs.shape)
    deltas = torch.zeros(inputs.shape)

    for step in range(n_steps):
        prefix_len = step + 1
        prefix = inputs[:prefix_len, :]
        with torch.no_grad():
            clipped = torch.min(lengths, torch.tensor(prefix_len).to(device))
            probs = model.predict_proba((prefix, clipped))
            prefix_probs[step, :] = probs.gather(1, y_preds).squeeze()

    # First step is measured against the uniform prior, the rest against
    # the preceding prefix.
    deltas[0, :] = prefix_probs[0, :] - 1.0 / num_classes
    deltas[1:, :] = prefix_probs[1:, :] - prefix_probs[:-1, :]
    return deltas
def temporal_tail(model, batch, y_preds, num_classes, device):
    """Per-step change in the probability of `y_preds` as the input suffix shrinks.

    For every step i the model is queried (without gradients) on the suffix
    `inputs[i:]` with lengths reduced by i but never below 1, and the
    probability gathered at `y_preds` is recorded. The result at the last
    step is that probability minus the uniform prior `1 / num_classes`; at
    earlier steps it is the difference to the next step's probability.

    Arguments:
        model: object exposing `predict_proba((inputs, lengths))`.
        batch: `(inputs, lengths)` pair; the first axis of `inputs` is the one
            being truncated.
        y_preds: index tensor passed to `gather(1, ...)` on the model output.
        num_classes: number of classes, used for the uniform baseline.
        device: device for the scalar used to floor `lengths`.

    Returns:
        Tensor with the same shape as `inputs`.
    """
    inputs, lengths = batch
    n_steps = inputs.shape[0]
    suffix_probs = torch.zeros(inputs.shape)
    deltas = torch.zeros(inputs.shape)

    for start in range(n_steps):
        suffix = inputs[start:, :]
        with torch.no_grad():
            floored = torch.max(lengths - start, torch.tensor(1).to(device))
            probs = model.predict_proba((suffix, floored))
            suffix_probs[start, :] = probs.gather(1, y_preds).squeeze()

    # Last step is measured against the uniform prior, the rest against
    # the following (shorter) suffix.
    deltas[-1, :] = suffix_probs[-1, :] - 1.0 / num_classes
    deltas[:-1, :] = suffix_probs[:-1, :] - suffix_probs[1:, :]
    return deltas
def get(self):
    """Classify the `query` request argument and report the model's confidence.

    Returns a dict with 'intent' (first element of the prediction) and
    'probability' (first predict_proba entry rounded to 3 places, as a
    string).
    """
    # Pull the user's query out of the parsed request arguments.
    user_query = parser.parse_args()['query']

    # Vectorize the single query, then ask the model for label and score.
    vectorized = model.vectorizer_transform(np.array([user_query]))
    prediction = model.predict(vectorized)
    pred_proba = model.predict_proba(vectorized)

    confidence = round(pred_proba[0], 3)
    return {
        'intent': prediction.item(0),
        'probability': str(confidence),
    }
def get(self):
    """Score the `query` request argument and report the model's confidence.

    Returns a dict with 'prediction' (the prediction rendered with `str`)
    and 'probability' (first predict_proba entry rounded to 3 places).
    """
    # Pull the user's query out of the parsed request arguments.
    user_query = parser.parse_args()['query']

    # Preprocess the single query, then ask the model for label and score.
    preprocessed = model.numericalImputer_transform(np.array([user_query]))
    prediction = model.predict(preprocessed)
    pred_proba = model.predict_proba(preprocessed)

    confidence = round(pred_proba[0], 3)
    return {
        'prediction': str(prediction),
        'probability': confidence,
    }
def word_drop(model, batch, y_preds, num_classes, device):
    """Measure the probability drop caused by zeroing each input position in turn.

    For every index i along the first axis, row i of `batch[0]` is set to 0
    in place, the model is queried (without gradients) on the whole `batch`,
    and the probability gathered at `y_preds` is recorded. The zeroed row is
    restored before the next position is processed and after the loop, so
    the input tensor holds its original values on return.

    `num_classes` and `device` are accepted but not used.

    Returns:
        Tensor shaped like `batch[0]` holding `1 - probability` per position.
    """
    inputs = batch[0]
    kept_probs = torch.zeros(inputs.shape)
    saved_row = None  # original contents of the row currently zeroed

    for pos in range(inputs.shape[0]):
        if saved_row is not None:
            # Undo the previous iteration's zeroing.
            inputs[pos - 1, :] = saved_row
        saved_row = torch.clone(inputs[pos, :])
        inputs[pos, :] = 0
        with torch.no_grad():
            probs = model.predict_proba(batch)
            kept_probs[pos, :] = probs.gather(1, y_preds).squeeze()

    if saved_row is not None:
        # Restore the last row that was zeroed.
        inputs[-1, :] = saved_row
    return 1. - kept_probs
def stream_frames2(stream, pafy_video = None):
    """Generator yielding server-sent-event strings while scanning `stream` for blobs.

    NOTE: Python 2 code (print statements).

    Emits, in order:
      * one 'onstart' event with video metadata (taken from `pafy_video`
        when given, otherwise a fixed video_length of 5000);
      * an 'onprogress' event for frames where int(sec % 20) == 0;
      * a plain data event for every blob whose model score exceeds 0.5,
        carrying the blob image, second, score, corner, size and full frame.

    Each frame is diffed against the previously seen frame; the very first
    frame only initialises that baseline.

    Arguments:
        stream: passed through to `utils.get_frames_from_stream`.
        pafy_video: optional object providing `length`, `title` and `author`.
            When it is None the function runs in "test" mode and sleeps
            3 seconds after any frame that produced a blob.
    """
    base_frame_sec = -1
    base_frame = None
    # "test" mode == no video metadata supplied.
    test = (pafy_video == None)
    # stream = '/windows/mit/rubakov.mp4' # testing
    # NOTE(review): this compares base_frame (None) with 0, which is True
    # under Python 2 ordering rules; base_frame_sec < 0 was presumably
    # intended -- confirm before porting to Python 3 (raises TypeError there).
    if base_frame < 0:
        if pafy_video:
            yield 'event: onstart\ndata: %s\n\n' % json.dumps({'video_length': pafy_video.length,
                                                               'video_title': pafy_video.title,
                                                               # 'video_desc': pafy_video.description,
                                                               'video_author': pafy_video.author})
        else:
            yield 'event: onstart\ndata: %s\n\n' % json.dumps({'video_length': 5000})
    try:
        for sec, frame in utils.get_frames_from_stream(stream,5):
            if int(sec % 20) == 0:
                yield 'event: onprogress\ndata: %s\n\n' % json.dumps({'sec': int(sec)})
            # First frame: just remember it as the baseline.
            if base_frame_sec < 0:
                base_frame = frame
                base_frame_sec = sec
                continue
            if test:
                has_blob = False
            # Blobs are extracted from the difference to the previous frame.
            for (xmin,ymin), blob in img_proc_utils.extract_blobs(frame-base_frame, img_proc_pipeline = img_proc_utils.pipeline2):
                proba = model.predict_proba(blob, model='webapp/model.pickle')
                if proba > 0.5:
                    has_blob = True
                    print sec, xmin, ymin,proba
                    yield 'data: %s\n\n' % json.dumps({'img': utils.img_to_base64_bytes(blob), #utils.img_to_base64_bytes(255-np.nan_to_num(abs(blob))),
                                                       'sec': int(sec),
                                                       'proba': proba,
                                                       'left_corner': [xmin,ymin],
                                                       'size': blob.shape,
                                                       'frame': utils.img_to_base64_bytes(frame)
                                                       })
            # The current frame becomes the baseline for the next one.
            base_frame = frame
            base_frame_sec = sec
            # has_blob is only assigned in test mode; the `test and` guard
            # keeps it from being read otherwise.
            if test and has_blob:
                time.sleep(3)
    except StopIteration:
        # NOTE(review): a `for` loop consumes the StopIteration of its own
        # iterator, so this handler only fires if something inside the loop
        # body raises it. Re-raising StopIteration inside a generator becomes
        # RuntimeError under PEP 479 (Python 3.7+) -- confirm intent.
        print 'onend!'
        yield 'event: onend\ndata: end\n\n'
        raise StopIteration
def get(self):
    """Score the request's features and explain the prediction.

    Returns a `(body, 200)` tuple where `body` holds "Prediction" (the model
    output rounded to 4 places) and "Impact" (per-feature contributions).
    """
    # Parse the request arguments and turn them into a pandas object.
    parsed_args = self.parser.parse_args()
    row = args_to_pandas(parsed_args)

    # Keep only the configured columns, then clean / transform them.
    df = api_feature_transformation(row[columns])

    # presumably the positive (fraud) class probability -- TODO confirm
    model, prediction, model_cols = predict_proba(df)

    # Contributions are computed on the columns the model was trained with.
    contributions = get_feature_contributions(model, df[model_cols])

    body = {
        "Prediction": round(prediction, 4),
        "Impact": contributions,
    }
    return body, 200
def predict(year, month, radius, aperture_size, incident_interval, time_step,
            select):
    """Run the prediction + cross-validation pipeline for one month and print results.

    Loads the month's feature pickle (optionally keeping only the last
    `select` rows), computes model probabilities, maps them to incident
    labels, cross-validates, and prints the label distribution, the run
    configuration and the average of every cross-validation metric.

    Returns nothing.
    """
    config = dict(
        year=year,
        month=month,
        radius=radius,
        aperture_size=aperture_size,
        incident_interval=incident_interval,
        time_step=time_step,
        select=select,
    )

    features_path = (
        f'output/waze/{year}_{month}_{radius}_{aperture_size}_features.pkl')
    df = pd.read_pickle(features_path)
    if select is not None:
        # Keep only the trailing `select` rows.
        df = df[-int(select):]

    x = model.predict_proba(df, int(incident_interval), time_step)
    y = label_mapper(x, load_incidents(aperture_size))
    print(np.unique(y, return_counts=True))

    cv_results = model.cross_validate(x, y)
    for key, value in config.items():
        print(key, value)
    for metric, values in cv_results.items():
        print(metric, np.average(values))
def step(self, X_test, iteration):
    """Advance the training schedule by one iteration.

    Each call does, in order:
      1. iteration-specific work (feature extraction at fixed iterations,
         launching remote jobs at iteration 2);
      2. picks up prediction files written by the external solutions;
      3. collects the remote feature extraction result once it is ready;
      4. trains at most one new model and appends its result to
         `self.predictions` as a `(score, name, y_pred)` tuple.

    Arguments:
        X_test: test data; passed to `make_inference` / `ray.put` /
            `self.pm.predict` depending on the iteration.
        iteration: 1-based step counter driving the hard-coded schedule.

    Returns:
        None. All results are stored on `self` (`predictions`, `features`,
        `features_completed`).
    """
    ####################################### Hardcoded sequences of actions ####################################################
    if iteration == 1:
        # as fast as possible: shortest inference settings and a 20-iteration
        # logistic regression, stored with a placeholder score of 1
        X_train, X_test = make_inference(self.se_extractor,
                                         self.X,
                                         X_test,
                                         self.device,
                                         max_len=3,
                                         num_eval=1)
        model = LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   max_iter=20)
        model.fit(X_train, self.y.argmax(axis=1))
        y_pred = model.predict_proba(X_test)
        self.predictions.append((1, 'first', y_pred))
        self.features['se_3_1'] = (X_train, X_test)
        return

    if iteration == 2:
        # schedule a non-blocking op
        self.data_id = ray.put((self.X, self.y, X_test, self.cv))
        self.X_music = ray.remote(
            num_gpus=0.25, num_cpus=1,
            max_calls=1)(extract_musicnn_features).remote(self.data_id)

        # Replace the placeholder from iteration 1 with a cross-validated
        # model on the same features.
        self.predictions.clear()
        X_tr, X_t = self.features['se_3_1']
        model = LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   max_iter=100)
        score, y_pred = fit_single(None, model, self.cv, X_tr, self.y, X_t,
                                   self.metric)
        self.predictions.append((score, 'first', y_pred))

        # Launch the two external solutions as remote tasks; their output
        # is picked up from the out_path directories further below.
        module_path = f'{self.root_path}/3rdparty/autospeech19/'
        self.solution3_2019 = ray.remote(
            num_gpus=0.25, num_cpus=1,
            max_calls=1)(top3_2019_kon).remote(module_path,
                                               self.out_path_top3_2019,
                                               self.n_classes, self.data_id,
                                               self.metric)

        module_path = f'{self.root_path}/3rdparty/AutoSpeech/code_submission'
        self.solution1_2019 = ray.remote(
            num_gpus=0.25, num_cpus=1, max_calls=1)(top1_2019_hazza_cheng).remote(
                module_path, self.out_path_top1_2019, self.n_classes,
                self.data_id, self.metric)

    # NOTE(review): in the branches below X_test is rebound to the output of
    # make_inference, and that rebound value is what self.pm.predict(X_test)
    # receives later in the same call -- confirm this is intended.
    if iteration == 3:
        X_train, X_test = make_inference(self.se_extractor,
                                         self.X,
                                         X_test,
                                         self.device,
                                         max_len=5,
                                         num_eval=5)
        self.features['se_5_5'] = (X_train.mean(axis=1), X_test.mean(axis=1))
        # "expand": fold axis 1 of the train features into the sample axis
        # instead of averaging over it.
        X_train = X_train.transpose(1, 0, 2).reshape(
            (-1, X_train.shape[-1]))
        self.features['se_5_5_expand'] = (X_train, X_test.mean(axis=1))

    if iteration == 10:
        X_train, X_test = make_inference(self.se_extractor,
                                         self.X,
                                         X_test,
                                         self.device,
                                         max_len=10,
                                         num_eval=5)
        self.features['se_10_5'] = (X_train.mean(axis=1),
                                    X_test.mean(axis=1))

    if iteration == 15:
        X_train, X_test = make_inference(self.se_extractor,
                                         self.X,
                                         X_test,
                                         self.device,
                                         max_len=15,
                                         num_eval=5)
        self.features['se_15_5'] = (X_train.mean(axis=1),
                                    X_test.mean(axis=1))

    if iteration == 20:
        X_train, X_test = make_inference(self.se_extractor,
                                         self.X,
                                         X_test,
                                         self.device,
                                         max_len=10,
                                         num_eval=10)
        self.features['se_10_10'] = (X_train.mean(axis=1),
                                     X_test.mean(axis=1))

    ##################################### Check for external results ############################################################
    # NOTE(review): the bare except below swallows every error (including
    # partially written files and KeyboardInterrupt) -- best-effort by design,
    # but failures are invisible.
    try:
        # TODO add interprocess filelock (concurrent read/write)
        for root_path in [
                self.out_path_top1_2019, self.out_path_top3_2019
        ]:
            ext_paths = glob.glob(root_path + '/*.pkl.lzma')
            for path in ext_paths:
                score, y_pred = joblib.load(path)
                # Name = "<directory>_<file stem>"; used to skip files that
                # were already added on a previous step.
                name = os.path.basename(path).split('.')[0]
                name = f'{os.path.dirname(path)}_{name}'
                names = set([x[1] for x in self.predictions])
                if name in names:
                    continue
                print('extresult', score, name)
                self.predictions.append((score, name, y_pred))
    except:
        pass

    ########################################### Train a model #########################################################
    # Check whether the futures are ready
    if 'mg' not in self.features:
        ready, nready = ray.wait([self.X_music], timeout=0.1)
        if ready:
            X_tr, X_te = ray.get(self.X_music)
            X_tr = time_pooling(X_tr, 'mean')
            X_te = time_pooling(X_te, 'mean')
            assert len(X_tr.shape) == 2 and len(X_te.shape) == 2
            print('>' * 400, 'MG ready')
            self.features['mg'] = (X_tr, X_te)
            # The pm search is started only once the 'mg' features exist.
            self.pm.start(self.data_id,
                          time_budget=TIME_BUDGET,
                          seconds_per_step=10,
                          max_t=5,
                          reduction_factor=4)

    # Select candidates to train: every (preprocessor, feature set) pair
    # that has not been completed yet.
    candidates = []
    for model_suf in ['standartize', 'normalize', 'noop', 'sign_sqrt']:
        for fname in self.features:
            out_name = f'{fname}_{model_suf}'
            if out_name in self.features_completed:
                continue
            candidates.append((model_suf, fname, out_name))

    # Train random candidate
    if candidates and np.random.rand() < 0.8:  # sometimes skip this step
        r_idx = np.random.randint(0, len(candidates))
        model_suf, fname, out_name = candidates[r_idx]
        prep = ClassDesc(Preprocessor, model_suf)
        X_train, X_test = self.features[fname]
        # Fewer solver iterations for larger training sets.
        max_iter = 100 if len(X_train) < 1500 else 30
        model = LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   max_iter=max_iter)
        score, y_pred = fit_single(prep, model, self.cv, X_train, self.y,
                                   X_test)
        self.features_completed.add(out_name)
        self.predictions.append((score, out_name, y_pred))
        return  # one model per iteration

    if self.pm.executor is not None:
        pm_results = self.pm.executor.get_results()[0]
        if pm_results:
            # Best score first.
            pm_results = list(sorted(pm_results, key=lambda x: -x['score']))
            pm_top_score = pm_results[0]['score']
            top_predictions = list(
                sorted(self.predictions, key=lambda x: -x[0]))
            # Only refit/predict when pm is within 0.03 of the best
            # prediction collected so far.
            if pm_top_score > top_predictions[0][0] - 0.03:
                # NOTE(review): reads self.iteration, not the `iteration`
                # argument -- confirm both are kept in sync by the caller.
                refit_seconds = 10 if self.iteration < 30 else 20
                y_pred = self.pm.predict(X_test, refit_seconds=refit_seconds)
                name = f'pm_{pm_top_score}'
                if len(pm_results) > 1:
                    pm_top2_score = pm_results[1]['score']
                    name = f'pm_{pm_top_score}_{pm_top2_score}'
                self.predictions.append((pm_top_score, name, y_pred))

    if 'mg' in self.features:
        # Fuse the 'mg' features with each other feature set; the first
        # combination not yet completed is trained and the call returns.
        X1_train, X1_test = self.features['mg']
        for model_suf in ['standartize', 'normalize', 'noop', 'sign_sqrt']:
            for fname in self.features:
                if 'expand' in fname:
                    continue
                out_name = f'mg_{fname}_fus_{model_suf}'
                if out_name in self.features_completed:
                    continue
                X2_train, X2_test = self.features[fname]
                pre = ClassDesc(Preprocessor, model_suf)
                post = ClassDesc(Preprocessor, model_suf)
                fus = StaticFusion(pre, post)
                X_train = fus.fit_transform(X1_train, X2_train)
                X_test = fus.transform(X1_test, X2_test)
                model = LogisticRegression(solver='lbfgs',
                                           multi_class='multinomial')
                score, y_pred = fit_single(None, model, self.cv, X_train,
                                           self.y, X_test)
                self.features_completed.add(out_name)
                self.predictions.append((score, out_name, y_pred))
                return
return 1 def one_hot(df): df['delivery_method_1.0'] = (df['delivery_method'] == 1).astype(int) df['delivery_method_3.0'] = (df['delivery_method'] == 3).astype(int) df['delivery_method_nan'] = (df['delivery_method'] == np.nan).astype(int) df['has_header_1.0'] = (df['has_header'] == 1).astype(int) df['has_header_nan'] = (df['has_header'] == np.nan).astype(int) df['user_type_2.0'] = (df['user_type'] == 2).astype(int) df['user_type_3.0'] = (df['user_type'] == 3).astype(int) df['user_type_4.0'] = (df['user_type'] == 4).astype(int) df['user_type_5.0'] = (df['user_type'] == 5).astype(int) df['user_type_103.0'] = (df['user_type'] == 103).astype(int) df['user_type_nan'] = (df['user_type'] == np.nan).astype(int) return df.drop(columns=['delivery_method', 'has_header', 'user_type']) if __name__ == "__main__": with open('models/LRmodel.pkl', 'rb') as f: model = pickle.load(f) with open('models/LRmodelScaler.pkl', 'rb') as f: scaler = pickle.load(f) X, y = get_example_X_y('data/test_script_examples.csv', scaler) print(model.predict_proba(X)[np.random.randint(low=0, high=25)])
def train_logreg(trX,
                 trY,
                 vaX=None,
                 vaY=None,
                 teX=None,
                 teY=None,
                 penalty='l1',
                 max_iter=100,
                 C=2**np.arange(-8, 1).astype(float),
                 seed=42,
                 model=None,
                 eval_test=True,
                 neurons=None):
    """
    slightly modified version of openai implementation
    https://github.com/openai/generating-reviews-discovering-sentiment/blob/master/utils.py

    Fit (or just score) a logistic regression and report accuracy and
    probabilities on train / validation / test data.

    If `model` is None, one model is trained per value of `C`; the value with
    the best validation accuracy (training accuracy when no validation set is
    given) is refitted and used. If `model` is not None it is not trained,
    only scored.

    Arguments:
        trX, trY: training features and labels.
        vaX, vaY: optional validation features and labels.
        teX, teY: optional test features and labels.
        penalty, max_iter: forwarded to LogisticRegression.
        C: a single value or an iterable of inverse regularisation strengths.
        seed: base random_state; candidate i uses `seed + i`, the final refit
            `seed + len(C)`.
        model: optional pre-trained model to score instead of training.
        eval_test: when test data is given, score it (True) or only predict
            probabilities and keep the validation score (False).
        neurons: optional column indices selecting a feature subset.

    Returns:
        (model, scores, probs, c, nnotzero) where `scores` and `probs` are
        lists ordered [train, validation, test/eval], scores are accuracies in
        percent, `c` is the regularisation value in use and `nnotzero` the
        number of non-zero coefficients.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    from collections.abc import Iterable

    # if only a single value is provided for C make it iterable so we can loop over
    if not isinstance(C, Iterable):
        C = [C]

    # extract features for given neuron indices
    if neurons is not None:
        trX = trX[:, neurons]
        if vaX is not None:
            vaX = vaX[:, neurons]
        if teX is not None:
            teX = teX[:, neurons]

    # Cross validation over C
    if model is None:
        candidate_scores = []
        for i, c in enumerate(C):
            candidate = LogisticRegression(C=c,
                                           penalty=penalty,
                                           max_iter=max_iter,
                                           random_state=seed + i)
            candidate.fit(trX, trY)
            if vaX is not None:
                candidate_scores.append(candidate.score(vaX, vaY))
            else:
                candidate_scores.append(candidate.score(trX, trY))
            del candidate
        c = C[np.argmax(candidate_scores)]
        model = LogisticRegression(C=c,
                                   penalty=penalty,
                                   max_iter=max_iter,
                                   random_state=seed + len(C))
        model.fit(trX, trY)
    else:
        c = model.C

    # predict probabilities and get accuracy of regression model on train, val, test as appropriate
    # also get number of regression weights that are not zero. (number of features used for modeling)
    nnotzero = np.sum(model.coef_ != 0)
    scores = []
    probs = []

    train_score, train_probs = score_and_predict(model, trX, trY)
    scores.append(train_score * 100)
    probs.append(train_probs)

    # Without a validation set the training results stand in for it.
    if vaX is None:
        val_score, val_probs = train_score, train_probs
    else:
        val_score, val_probs = score_and_predict(model, vaX, vaY)
    scores.append(val_score * 100)
    probs.append(val_probs)

    # The third entry defaults to the validation results and is overridden
    # when test data is available.
    eval_score, eval_probs = val_score, val_probs
    if teX is not None and teY is not None:
        if eval_test:
            eval_score, eval_probs = score_and_predict(model, teX, teY)
        else:
            eval_probs = model.predict_proba(teX)[:, 1]
    scores.append(eval_score * 100)
    probs.append(eval_probs)

    return model, scores, probs, c, nnotzero
def model(features, test_features, encoding='ohe', n_folds=5):
    """Train and test a light gradient boosting model using cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the `SK_ID_CURR` and `TARGET`
            columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include `SK_ID_CURR`.
        encoding (str, default = 'ohe'):
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding
        n_folds (int, default = 5): number of folds to use for cross validation

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (average over the fold models).
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model
            (average over the fold models).
        valid_metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC) for each
            fold and overall.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.
    """
    # Extract the ids needed for the submission
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels for training. They are converted to a plain array so
    # that the positional fold indices below index by position; indexing the
    # Series directly would look rows up by index label instead.
    labels = np.asarray(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns=['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns=['SK_ID_CURR'])

    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Align the dataframes by the columns
        features, test_features = features.align(test_features,
                                                 join='inner',
                                                 axis=1)

        # No categorical indices to record
        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':
        # Create a label encoder
        label_encoder = LabelEncoder()

        # List for storing categorical indices
        cat_indices = []

        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1, )))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1, )))

                # Record the categorical indices
                cat_indices.append(i)

    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object. The folds are not shuffled, so no random_state
    # is passed: it has no effect without shuffling and current scikit-learn
    # raises ValueError for that combination.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[
            train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[
            valid_indices]

        # Create the model
        model = lgb.LGBMClassifier(n_estimators=10000,
                                   objective='binary',
                                   class_weight='balanced',
                                   learning_rate=0.05,
                                   reg_alpha=0.1,
                                   reg_lambda=0.1,
                                   subsample=0.8,
                                   n_jobs=-1,
                                   random_state=50)

        # Train the model
        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords
        # of older lightgbm releases; newer ones expect callbacks -- confirm
        # the pinned version.
        model.fit(train_features,
                  train_labels,
                  eval_metric='auc',
                  eval_set=[(valid_features, valid_labels),
                            (train_features, train_labels)],
                  eval_names=['valid', 'train'],
                  categorical_feature=cat_indices,
                  early_stopping_rounds=100,
                  verbose=200)

        # Record the best iteration
        best_iteration = model.best_iteration_

        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Make predictions
        test_predictions += model.predict_proba(
            test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(
            valid_features, num_iteration=best_iteration)[:, 1]

        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({
        'SK_ID_CURR': test_ids,
        'TARGET': test_predictions
    })

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({
        'fold': fold_names,
        'train': train_scores,
        'valid': valid_scores
    })

    return submission, feature_importances, metrics