Code example #1
File: train.py  Project: sakami0000/kaggle_quora
def train_keras(train,
                valid,
                test,
                embedding_matrix,
                model,
                epochs=6,
                batch_size=512):

    train_x, train_y = train
    val_x, val_y = valid
    (test_x, ) = test

    # build the model (`model` is passed in as a factory callable)
    model = model(embedding_matrix)
    model.fit(train_x,
              train_y,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(val_x, val_y))
    pred_val_y = model.predict([val_x], batch_size=batch_size, verbose=0)
    pred_test_y = model.predict([test_x], batch_size=batch_size, verbose=0)

    # search for the decision threshold that maximizes the validation metric
    best_th = threshold_search(val_y, pred_val_y)

    # binarize the test predictions at the best threshold
    preds = (pred_test_y > best_th).astype(int)

    return preds
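
All of the examples on this page rely on a project-specific threshold_search helper that is not shown. A minimal sketch of what it might look like, assuming a linear scan with F1 as the metric (example #1 treats the return value as a bare threshold, while examples #3-#5 expect a dict keyed by 'threshold' and the metric name; the grid and metric here are assumptions):

import numpy as np
from sklearn.metrics import f1_score

def threshold_search(y_true, y_proba):
    # scan candidate thresholds (1-D arrays expected) and keep the best F1
    best_threshold, best_score = 0.0, 0.0
    for threshold in np.arange(0.1, 0.9, 0.01):
        score = f1_score(y_true, (y_proba > threshold).astype(int))
        if score > best_score:
            best_threshold, best_score = threshold, score
    return best_threshold
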
Code example #2
File: train.py  Project: sakami0000/kaggle_quora
def train_logreg(train, test, max_iter=40, n_splits=20):

    # TF-IDF features fitted on the combined train + test question text
    tfidf = TfidfVectorizer(
        ngram_range=(1, 4),
        tokenizer=tokenize,
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True).fit(
            pd.concat([train['question_text'], test['question_text']]))

    train_x = tfidf.transform(train['question_text'])
    test_x = tfidf.transform(test['question_text'])
    train_y = train['target'].values

    # Naive Bayes scaling
    nb_transformer = NBFeaturer(alpha=1).fit(train_x, train_y)
    train_nb = nb_transformer.transform(train_x)
    test_nb = nb_transformer.transform(test_x)

    # out-of-fold training: one logistic regression per stratified fold
    models = []
    train_meta = np.zeros(train_y.shape)
    test_meta = np.zeros(test_x.shape[0])

    splits = list(
        StratifiedKFold(n_splits=n_splits, shuffle=True,
                        random_state=seed).split(train, train_y))

    for idx, (train_idx, valid_idx) in enumerate(splits):
        x_train_fold = train_nb[train_idx]
        y_train_fold = train_y[train_idx]

        x_val_fold = train_nb[valid_idx]
        y_val_fold = train_y[valid_idx]

        model = LogisticRegression(solver='lbfgs',
                                   dual=False,
                                   class_weight='balanced',
                                   C=0.5,
                                   max_iter=max_iter)
        model.fit(x_train_fold, y_train_fold)
        models.append(model)

        valid_pred = model.predict_proba(x_val_fold)
        train_meta[valid_idx] = valid_pred[:, 1]
        test_meta += model.predict_proba(test_nb)[:, 1] / len(splits)

    # search for the best threshold on the out-of-fold predictions
    best_th = threshold_search(train_y, train_meta)

    # binarize the fold-averaged test predictions
    preds = (test_meta > best_th).astype(int)

    return preds
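
NBFeaturer is likewise project-specific. A plausible sketch, assuming the standard Naive Bayes log-count-ratio transformer from Wang & Manning's NB-SVM (the class name and the alpha smoothing parameter come from the call above; the body is an assumption):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class NBFeaturer(BaseEstimator, TransformerMixin):
    def __init__(self, alpha=1):
        self.alpha = alpha

    def _pr(self, x, y_i, y):
        # smoothed mean feature value within one class
        p = x[y == y_i].sum(0)
        return (p + self.alpha) / ((y == y_i).sum() + self.alpha)

    def fit(self, x, y):
        # log ratio of class-conditional feature likelihoods
        self.r_ = np.log(self._pr(x, 1, y) / self._pr(x, 0, y))
        return self

    def transform(self, x):
        # scale each TF-IDF column by its log-count ratio
        return x.multiply(self.r_)
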
Code example #3
def objective(trial):
    preds_val = []
    y_val = []
    for idx, (train_idx, val_idx) in enumerate(splits):
        print("Beginning fold {}".format(idx+1))
        train_X, train_y, val_X, val_y = X[train_idx], y[train_idx], X[val_idx], y[val_idx]
        # setting explore space
        optimizer = trial.suggest_categorical("optimizer", ["sgd", "adam", "rmsprop"])
        lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
        lstm_units = int(trial.suggest_discrete_uniform("lstm_units", 16, 128, 16))
        dense_units = int(trial.suggest_discrete_uniform("dense_units", 16, 128, 16))
        model = modelutils.get_model(model_name, train_X.shape, n_outputs, lstm_units, dense_units)
        ckpt = ModelCheckpoint(os.path.join(result_dir, f'weights_{idx}.h5'), save_best_only=True, save_weights_only=True, verbose=1, monitor='val_loss', mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10,
                                      verbose=1, mode='min', min_delta=0.0001)
        early = EarlyStopping(monitor="val_loss",
                              mode="min",
                              patience=25)
        model.compile(loss=loss, optimizer=optimizer, metrics=[utils.matthews_correlation])
        K.set_value(model.optimizer.lr, lr)
        history = model.fit(train_X, train_y, batch_size=batchsize, epochs=epoch,
                            validation_data=(val_X, val_y), callbacks=[ckpt, reduce_lr, early])
        model.load_weights(os.path.join(result_dir, f'weights_{idx}.h5'))
        preds_val.append(model.predict(val_X, batch_size=512).flatten())
        y_val.append(val_y.flatten())

    # concatenate out-of-fold predictions and labels across folds
    preds_val = np.concatenate(preds_val)
    y_val = np.concatenate(y_val)

    best_search_result = utils.threshold_search(y_val, preds_val)
    best_threshold = best_search_result['threshold']
    best_val_score = best_search_result['matthews_correlation']
    return 1 - best_val_score
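
Since objective returns 1 - MCC, it is meant to be minimized by an Optuna study. A minimal driver sketch (the trial count is an assumption):

import optuna

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params, 1 - study.best_value)  # best hyperparameters and their MCC
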
Code example #4
    # (the snippet begins mid-way through a per-fold training loop; the opening
    #  of the fit call below is reconstructed from the pattern in example #3)
    model.fit(train_X, train_y,
              batch_size=batchsize,
              epochs=epoch,
              validation_data=(val_X, val_y),
              callbacks=[ckpt])
    # loads the best weights saved by the checkpoint
    model.load_weights('weights_{}.h5'.format(idx))
    # Add the predictions of the validation to the list preds_val
    preds_val.append(model.predict(val_X, batch_size=512))
    # and the val true y
    y_val.append(val_y)

# concatenates all and prints the shape
preds_val = np.concatenate(preds_val)[..., 0]
y_val = np.concatenate(y_val)

best_search_result = utils.threshold_search(y_val, preds_val)
best_threshold = best_search_result['threshold']
best_val_score = best_search_result['matthews_correlation']

print(f'best validation score: {best_val_score}')

meta_test = pd.read_csv('./data/metadata_test.csv')
meta_test = meta_test.set_index(['signal_id'])

# First we declare a few parameters that control loading of the test data.
# The file is too large to load in one go, so we read it in 10 parts.
first_sig = meta_test.index[0]
n_parts = 10
max_line = len(meta_test)
part_size = int(max_line / n_parts)
last_part = max_line % n_parts
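
The snippet ends before the loading loop itself. A hedged sketch of how these parameters might drive a chunked read of the test signals (the parquet path and string-typed column ids follow the common pattern for this dataset, but are assumptions here):

import pandas as pd

for part in range(n_parts + 1):
    start = part * part_size
    # the final iteration picks up the `last_part` remainder columns
    end = min(start + part_size, max_line)
    if start >= end:
        break
    # each parquet column holds one signal, keyed by its signal_id as a string
    cols = [str(first_sig + i) for i in range(start, end)]
    chunk = pd.read_parquet('./data/test.parquet', columns=cols)
    # ... feature extraction and model prediction on `chunk` would go here
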
Code example #5
                    # the domain-adversarial variant takes an extra
                    # gradient-reversal weight at call time
                    if isinstance(model, models.DomainAdversarialLSTM):
                        output = model(batch, 1.0)
                    else:
                        output = model(batch)

                prediction = torch.sigmoid(
                    output["logits"]).detach().cpu().numpy().reshape(-1)
                test_preds.append(prediction)

            # each measurement covers 3 phase signals, so repeat the
            # per-measurement prediction once per phase
            test_preds_np = np.repeat(np.concatenate(test_preds), 3)
            fold_test_predictions.append(test_preds_np)

        oof_target = np.concatenate(oof_labels)
        oof_prediction = np.concatenate(oof_preds)

        search_result = utils.threshold_search(y_true=oof_target,
                                               y_proba=oof_prediction)
        print(f"SEED: {seed}", search_result)

        # average the per-fold test predictions, then binarize at the
        # threshold found on the out-of-fold data
        soft_prediction = np.squeeze(np.mean(fold_test_predictions, axis=0))
        hard_prediction = (soft_prediction >
                           search_result["threshold"]).astype(int)

        submission = pd.read_csv(
            "input/vsb-power-line-fault-detection/sample_submission.csv")
        submission["target"] = hard_prediction
        submission.to_csv(EXPERIMENT_DIR / "submission.csv", index=False)

        representations = []
        labels = []
        domain_labels = []
        for i in range(5):