def train_keras(train, valid, test, embedding_matrix, model, epochs=6, batch_size=512):
    """Build, train, and apply a Keras model; return hard (0/1) test predictions.

    Parameters
    ----------
    train : tuple
        ``(train_x, train_y)`` training features and labels.
    valid : tuple
        ``(val_x, val_y)`` used both for Keras validation and threshold search.
    test : tuple
        One-element tuple ``(test_x,)`` of test features.
    embedding_matrix : array
        Pretrained embedding weights passed to the model factory.
    model : callable
        Factory taking ``embedding_matrix`` and returning a compiled Keras model.
    epochs, batch_size : int
        Standard Keras ``fit`` parameters.

    Returns
    -------
    ndarray of int
        Binary test predictions, thresholded at the value found on validation.
    """
    train_x, train_y = train
    val_x, val_y = valid
    (test_x,) = test

    # Build the network under a distinct local name instead of rebinding the
    # `model` parameter (the original shadowed the factory with its product).
    net = model(embedding_matrix)
    net.fit(train_x, train_y,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(val_x, val_y))

    pred_val_y = net.predict([val_x], batch_size=batch_size, verbose=0)
    pred_test_y = net.predict([test_x], batch_size=batch_size, verbose=0)

    # Search the decision threshold on validation predictions, then apply it
    # to the test predictions.
    # NOTE(review): `threshold_search` is used here as returning a scalar;
    # elsewhere in this project a dict-returning variant exists — confirm
    # which one is in scope for this module.
    best_th = threshold_search(val_y, pred_val_y)
    return (pred_test_y > best_th).astype(int)
def train_logreg(train, test, max_iter=40, n_splits=20):
    """TF-IDF + NB-scaled logistic regression with stratified CV bagging.

    Fits a word n-gram TF-IDF on the union of train and test questions,
    applies Naive-Bayes feature scaling, trains one balanced logistic
    regression per stratified fold, averages the per-fold test probabilities,
    searches the decision threshold on the pooled out-of-fold predictions,
    and returns hard (0/1) test predictions.
    """
    # Fit TF-IDF on all available text so train and test share one vocabulary.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 4), tokenizer=tokenize, min_df=3, max_df=0.9,
        strip_accents='unicode', use_idf=True, smooth_idf=True,
        sublinear_tf=True)
    vectorizer.fit(pd.concat([train['question_text'], test['question_text']]))
    x_train = vectorizer.transform(train['question_text'])
    x_test = vectorizer.transform(test['question_text'])
    y_train = train['target'].values

    # Naive Bayes log-count-ratio scaling of the sparse features.
    nb = NBFeaturer(alpha=1).fit(x_train, y_train)
    x_train_nb = nb.transform(x_train)
    x_test_nb = nb.transform(x_test)

    models = []
    oof = np.zeros(y_train.shape)          # out-of-fold validation probabilities
    test_avg = np.zeros(x_test.shape[0])   # fold-averaged test probabilities
    folds = list(
        StratifiedKFold(n_splits=n_splits, shuffle=True,
                        random_state=seed).split(train, y_train))
    n_folds = len(folds)

    for fold_train_idx, fold_valid_idx in folds:
        clf = LogisticRegression(solver='lbfgs', dual=False,
                                 class_weight='balanced', C=0.5,
                                 max_iter=max_iter)
        clf.fit(x_train_nb[fold_train_idx], y_train[fold_train_idx])
        models.append(clf)
        oof[fold_valid_idx] = clf.predict_proba(x_train_nb[fold_valid_idx])[:, 1]
        test_avg += clf.predict_proba(x_test_nb)[:, 1] / n_folds

    # Threshold chosen on pooled out-of-fold predictions, applied to test.
    best_th = threshold_search(y_train, oof)
    return (test_avg > best_th).astype(int)
def objective(trial):
    """Optuna objective: cross-validated LSTM training, minimizing 1 - MCC.

    Samples the optimizer type, learning rate, and layer widths from
    ``trial``; trains one model per CV fold (``splits``, ``X``, ``y``,
    ``batchsize``, ``epoch``, ``loss``, ``model_name``, ``n_outputs``,
    ``result_dir`` come from the enclosing scope); pools the out-of-fold
    predictions and scores them with ``utils.threshold_search``.

    Returns
    -------
    float
        ``1 - best_val_score`` so that Optuna's minimization maximizes the
        Matthews correlation coefficient.
    """
    preds_val = []
    y_val = []
    for idx, (train_idx, val_idx) in enumerate(splits):
        print("Beginning fold {}".format(idx + 1))
        # Renamed from the original's `val_y` to avoid confusion with the
        # accumulator list `y_val`.
        train_X, train_y = X[train_idx], y[train_idx]
        val_X, val_y_fold = X[val_idx], y[val_idx]

        # Hyperparameter search space.
        optimizer = trial.suggest_categorical("optimizer",
                                              ["sgd", "adam", "rmsprop"])
        lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
        lstm_units = int(trial.suggest_discrete_uniform("lstm_units", 16, 128, 16))
        dense_units = int(trial.suggest_discrete_uniform("dense_units", 16, 128, 16))

        model = modelutils.get_model(model_name, train_X.shape, n_outputs,
                                     lstm_units, dense_units)

        ckpt = ModelCheckpoint(os.path.join(result_dir, f'weights_{idx}.h5'),
                               save_best_only=True, save_weights_only=True,
                               verbose=1, monitor='val_loss', mode='min')
        # NOTE(review): `epsilon` is the legacy Keras kwarg; newer Keras expects
        # `min_delta` — kept as-is, confirm against the pinned Keras version.
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                      patience=10, verbose=1, mode='min',
                                      epsilon=0.0001)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=25)

        model.compile(loss=loss, optimizer=optimizer,
                      metrics=[utils.matthews_correlation])
        # Override the optimizer's default learning rate with the sampled one.
        K.set_value(model.optimizer.lr, lr)

        # Keras documents validation_data as a tuple (was a list); the unused
        # `history` binding is dropped.
        model.fit(train_X, train_y,
                  batch_size=batchsize, epochs=epoch,
                  validation_data=(val_X, val_y_fold),
                  callbacks=[ckpt, reduce_lr, early])

        # Restore the best checkpoint before producing out-of-fold predictions.
        model.load_weights(os.path.join(result_dir, f'weights_{idx}.h5'))
        preds_val.append(model.predict(val_X, batch_size=512).flatten())
        y_val.append(val_y_fold.flatten())

    # Pool all folds and search the best decision threshold.
    preds_val = np.concatenate(preds_val)
    y_val = np.concatenate(y_val)
    # Fixed the original's `best_sarch_result` typo; the unused
    # `best_threshold` extraction is dropped.
    best_search_result = utils.threshold_search(y_val, preds_val)
    best_val_score = best_search_result['matthews_correlation']
    return 1 - best_val_score
# NOTE(review): this chunk opens mid-statement — the model.fit(...) call and the
# per-fold loop it belongs to begin above this view; line breaks/indentation are
# reconstructed and should be confirmed against the full file.
          batch_size=batchsize, epochs=epoch,
          validation_data=[val_X, val_y], callbacks=[ckpt])
# Reload the best weights saved by the ModelCheckpoint callback before predicting.
model.load_weights('weights_{}.h5'.format(idx))
# Collect this fold's validation predictions and the matching true labels.
preds_val.append(model.predict(val_X, batch_size=512))
y_val.append(val_y)
# Presumably after the fold loop: pool every fold's predictions/labels.
# [..., 0] drops the trailing singleton output dimension of model.predict.
preds_val = np.concatenate(preds_val)[..., 0]
y_val = np.concatenate(y_val)
# Search the decision threshold maximizing Matthews correlation on the pooled
# out-of-fold predictions. (Variable name carries a typo: "sarch" -> "search".)
best_sarch_result = utils.threshold_search(y_val, preds_val)
best_threshold = best_sarch_result['threshold']
best_val_score = best_sarch_result['matthews_correlation']
print(f'best validation score: {best_val_score}')

# Test metadata, indexed by signal id.
meta_test = pd.read_csv('./data/metadata_test.csv')
meta_test = meta_test.set_index(['signal_id'])

# The test signal data is too large to load in one pass, so it is read in
# n_parts chunks; the parameters below drive that chunked loading.
first_sig = meta_test.index[0]
n_parts = 10
max_line = len(meta_test)
part_size = int(max_line / n_parts)   # rows per full chunk
last_part = max_line % n_parts        # remainder rows in the final chunk
# NOTE(review): fragment — begins inside a batched inference loop whose header is
# above this view; the loop nesting below is reconstructed, confirm against file.
if isinstance(model, models.DomainAdversarialLSTM):
    # The adversarial variant's forward takes an extra scalar argument
    # (presumably the gradient-reversal coefficient, fixed at 1.0) — TODO confirm.
    output = model(batch, 1.0)
else:
    output = model(batch)
prediction = torch.sigmoid(output["logits"]).detach().cpu().numpy().reshape(-1)
test_preds.append(prediction)

# Presumably after the batch loop: each prediction is repeated 3x — likely one
# value per measurement in a 3-way grouping of test signals; verify upstream.
test_preds_np = np.repeat(np.concatenate(test_preds), 3)
fold_test_predictions.append(test_preds_np)

# Presumably after all folds: threshold search on pooled out-of-fold data.
oof_target = np.concatenate(oof_labels)
oof_prediction = np.concatenate(oof_preds)
search_result = utils.threshold_search(y_true=oof_target,
                                       y_proba=oof_prediction)
print(f"SEED: {seed}", search_result)

# Average the per-fold test predictions (soft), then binarize at the searched
# threshold and write the submission file.
soft_prediction = np.squeeze(np.mean(fold_test_predictions, axis=0))
hard_prediction = (soft_prediction > search_result["threshold"]).astype(int)
submission = pd.read_csv(
    "input/vsb-power-line-fault-detection/sample_submission.csv")
submission["target"] = hard_prediction
submission.to_csv(EXPERIMENT_DIR / "submission.csv", index=False)

# Accumulators for a subsequent pass (representation extraction, by the names).
representations = []
labels = []
domain_labels = []
# NOTE(review): the body of this loop continues beyond this view.
for i in range(5):