Example #1
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.models import Model
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint
from keras_bert.layers import MaskedGlobalMaxPool1D

# N_CLASS and load_data are project-level definitions assumed to be in scope.


def train(save_path: str = 'models/model.h5') -> None:
    '''Train an intent classifier for Dawn on features extracted from BERT.

    Args:
        save_path (str): path to save the model
    '''
    CONFIG_PATH = 'models/LargeBert/bert_config.json'
    CHECKPOINT_PATH = 'models/LargeBert/bert_model.ckpt'
    DICT_PATH = 'models/LargeBert/vocab.txt'

    model = load_trained_model_from_checkpoint(
        CONFIG_PATH,
        CHECKPOINT_PATH,
        training=False,
        trainable=False,
        output_layer_num=4,
    )
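    # With training=False/trainable=False, BERT loads as a frozen feature
    # extractor; output_layer_num=4 makes keras-bert concatenate the outputs
    # of the last four transformer blocks into the 'Encoder-Output' layer
    # referenced below.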

    # keras.utils.plot_model(model, to_file='model.png', show_shapes=True)
    pool_layer = MaskedGlobalMaxPool1D(name='Pooling')(
        model.get_layer(name='Encoder-Output').output)
    out = Dense(32, activation='relu', name='Pre-Output')(pool_layer)
    output = Dense(units=N_CLASS, activation='softmax',
                   name='Final-Output')(out)
    model = Model(inputs=[
        model.get_layer(name='Input-Token').input,
        model.get_layer(name='Input-Segment').input
    ],
                  outputs=output)
    model.summary(line_length=120)

    opt = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, decay=0.01)
    model.compile(opt, loss='categorical_crossentropy', metrics=['acc'])
    checkpoint = ModelCheckpoint(save_path,
                                 verbose=1,
                                 monitor='val_loss',
                                 save_best_only=True,
                                 mode='auto')
    x_tokens, x_segments, y_in = load_data(dict_path=DICT_PATH)
    model.fit([x_tokens, x_segments],
              y_in,
              epochs=300,
              batch_size=32,
              callbacks=[checkpoint],
              validation_split=0.3,
              shuffle=True)
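
# Reloading the checkpoint written above requires keras-bert's custom layers;
# a minimal sketch, assuming keras_bert.get_custom_objects registers the BERT
# layers (MaskedGlobalMaxPool1D is added explicitly to be safe):
from keras.models import load_model
from keras_bert import get_custom_objects

custom_objects = get_custom_objects()
custom_objects['MaskedGlobalMaxPool1D'] = MaskedGlobalMaxPool1D
inference_model = load_model('models/model.h5', custom_objects=custom_objects)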


# --- separate snippet: stratified K-fold logistic-regression baseline ---
import argparse

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from src.preprocess import load_data
from src.utility import seed_everything


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("seed", type=int)
    parser.add_argument("n_folds", type=int)

    args = parser.parse_args()

    SEED = args.seed
    N_FOLDS = args.n_folds

    seed_everything(SEED)

    X, y = load_data("data/bank-additional-full.csv")
    folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    avg_score = 0

    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        print("Fold:", fold_idx, flush=True)

        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        model = LogisticRegression(tol=0.014562448890118148,
                                   C=9.256722875165577,
                                   fit_intercept=True,
                                   class_weight="balanced",
                                   solver="newton-cg",
                                   max_iter=120,
                                   warm_start=True,
                                   random_state=SEED)
        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_valid).astype(float)[:, 1]
        score = roc_auc_score(y_valid, y_pred)
        avg_score += score

        print("logistic regression score:", score, flush=True)

    avg_score /= N_FOLDS

    print("average score:", avg_score, flush=True)
# --- separate snippet: two-view ensemble classifier (start truncated) ---
# The page cut off the class definition and most of its fit method; only the
# tail of the unlabeled-pool bookkeeping survives:
#                     [unlabeled_pool, unlabeled_remained[:to_fill]])
#                 unlabeled_remained = unlabeled_remained[to_fill:]

import itertools

import numpy as np
from sklearn.metrics import accuracy_score

# load_data and init_logger are project-level helpers assumed to be in scope.


class TwoViewEnsembleClassifier:  # hypothetical name; the original was lost
    def predict(self, X):
        X1, X2 = X[:, self.features1], X[:, self.features2]
        proba1 = self.model1.predict_proba(X1)
        proba2 = self.model2.predict_proba(X2)
        ensemble: np.ndarray = proba1 + proba2
        return ensemble.argmax(axis=1)

    def score(self, X, y):
        return accuracy_score(y, self.predict(X))


if __name__ == '__main__':
    unlabeled_clinical_X, Ctr_X, Ctr_Y, Cval_X, Cval_Y, Ct_X, Ct_Y, unlabeled_genomic_X, Gtr_X, Gtr_Y, Gval_X, Gval_Y, Gt_X, Gt_Y = load_data(
        True)

    num_unlabeled_samples = len(unlabeled_genomic_X)
    num_features = Gtr_X.shape[1]
    unlabeled_y = np.ones(num_unlabeled_samples) * -1
    Gtr_X = np.concatenate([Gtr_X, unlabeled_genomic_X])
    Gtr_Y = np.concatenate([Gtr_Y, unlabeled_y])
    features = set(range(0, num_features))
    logger = init_logger(name='genomic_feature.log')
    best_score, best_features = 0, None
    for size in range(1, num_features // 2 + 1):
        for features1 in itertools.combinations(features, size):
            features1 = set(features1)
            features2 = features - features1
            # np.int was removed in NumPy 1.24; use the builtin int dtype
            features1 = np.array(list(features1), dtype=int)
            features2 = np.array(list(features2), dtype=int)
            # (snippet truncated: the loop body that trains and scores a model
            # on this feature split is cut off in the source)
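
# The loop above enumerates every split of the feature set into two halves,
# which grows combinatorially with num_features; a quick illustrative check:
from math import comb

n = 20  # illustrative feature count, not from the original snippet
print(sum(comb(n, size) for size in range(1, n // 2 + 1)))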
Example #4
def test_answer():
    # load_data and func are imported from the module under test
    X, Y = load_data()
    assert func(4) == 5
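
# pytest discovers `test_*` functions automatically; running `pytest -q` in
# the project root executes this test and reports the assertion result.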


# --- separate snippet: hyperopt-style objective for a logistic regression ---
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
from src.preprocess import load_data
from src.utility import seed_everything

warnings.filterwarnings("ignore")

SEED = 123
N_FOLDS = 5

seed_everything(SEED)

X, y = load_data("data/bank-additional-full.csv")

folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


def objective(args):
    avg_score = 0

    for train_idx, valid_idx in folds.split(X, y):
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        model = LogisticRegression(n_jobs=-1, random_state=SEED, **args)
        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_valid).astype(float)[:, 1]
        score = roc_auc_score(y_valid, y_pred)
        avg_score += score

    # hyperopt minimizes the objective, so the negated mean AUC is returned
    # (an assumption about how this function is consumed)
    return -avg_score / N_FOLDS

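
# A minimal sketch of wiring the objective into hyperopt's fmin; the search
# space below is illustrative, not from the original snippet:
from hyperopt import fmin, hp, tpe

space = {
    "C": hp.loguniform("C", -4, 4),
    "tol": hp.loguniform("tol", -10, -4),
}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)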


# --- separate snippet: scikit-learn-style MLP used by the __main__ block ---
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_X_y, column_or_1d
from sklearn.utils.multiclass import unique_labels


class GOAMultilayerPerceptron:
    # The constructor and training code were cut off by the page; only the
    # tail of predict() survived, so its head is reconstructed here.

    def predict(self, X):
        hidden_layer_sizes = list(self.hidden_layer_sizes)
        layer_units = [X.shape[1]] + hidden_layer_sizes + [self.n_outputs_]
        # Initialize layers
        activations = [X]

        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0], layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, self.coefs_, self.intercepts_)
        y_pred = activations[-1]
        if self.n_outputs_ == 1:
            y_pred = y_pred.ravel()
        return self.label_binarizer.inverse_transform(y_pred)

    def validate_input(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        classes = unique_labels(y)
        self.label_binarizer = LabelBinarizer()
        self.label_binarizer.fit(classes)
        y = self.label_binarizer.transform(y)
        return X, y
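
    # For reference: LabelBinarizer.transform() one-hot encodes labels, and
    # inverse_transform() maps score rows back to class labels, e.g.
    #   lb = LabelBinarizer().fit([0, 1, 2])
    #   lb.transform([1])                                  # -> [[0, 1, 0]]
    #   lb.inverse_transform(np.array([[0.1, 0.7, 0.2]]))  # -> array([1])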


if __name__ == '__main__':
    Ctr_X, Ctr_Y, Cval_X, Cval_Y, Ct_X, Ct_Y, Gtr_X, Gtr_Y, Gval_X, Gval_Y, Gt_X, Gt_Y = load_data()
    goamlp_ctr = GOAMultilayerPerceptron(N=100, x_val=Cval_X, y_val=Cval_Y, hidden_layer_sizes=[70], max_iter=5000, random_state=1)
    classify(goamlp_ctr, Ctr_X, Ctr_Y, Cval_X, Cval_Y, "GOAMLPClassifier", "clinical")
    goamlp_gtr = GOAMultilayerPerceptron(N=10000, x_val=Gval_X, y_val=Gval_Y, hidden_layer_sizes=[36], max_iter=50, random_state=1)
    classify(goamlp_gtr, Gtr_X, Gtr_Y, Gval_X, Gval_Y, "GOAMLPClassifier", "genetic")
Example #7
import argparse

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from src.preprocess import load_data
from src.utility import seed_everything


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("seed", type=int)
    parser.add_argument("n_folds", type=int)
    parser.add_argument('-o', '--output_features', action="store_true")

    args = parser.parse_args()

    SEED = args.seed
    N_FOLDS = args.n_folds

    seed_everything(SEED)

    X, y = load_data("data/bank-additional-full.csv")
    folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    avg_score = 0

    feature_importance = pd.DataFrame()
    feature_importance["Feature"] = X.columns
    feature_importance["Value"] = 0

    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        print("Fold:", fold_idx, flush=True)

        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        model = xgb.XGBRegressor(n_estimators=486,
                                 max_depth=23,
                                 learning_rate=0.014315933846251667,
                                 booster="gbtree",
                                 tree_method="exact",
                                 gamma=0.7581225878358416,
                                 subsample=0.9340339327920703,
                                 colsample_bytree=0.6940772015224637,
                                 colsample_bylevel=0.559247335020885,
                                 colsample_bynode=0.7962006061767392,
                                 reg_alpha=0.6394227535273009,
                                 reg_lambda=0.19510772446939947,
                                 scale_pos_weight=0.8349805523658489,
                                 objective="reg:squarederror",
                                 random_state=SEED)
        model.fit(X_train, y_train)

        # raw regressor outputs are used as ranking scores for the AUC metric
        y_pred = model.predict(X_valid).astype(float)
        score = roc_auc_score(y_valid, y_pred)
        avg_score += score

        print("xgboost score:", score, flush=True)

        current_importance = pd.DataFrame(zip(X.columns,
                                              model.feature_importances_),
                                          columns=["Feature", "Value"])
        feature_importance = pd.concat(
            (feature_importance,
             current_importance)).groupby("Feature", as_index=False).sum()

    avg_score /= N_FOLDS

    print("average score:", avg_score, flush=True)

    if args.output_features:
        feature_importance["Value"] *= 100 / feature_importance["Value"].sum()

        fig = plt.figure(figsize=(20, 20))
        fig.patch.set_facecolor("white")
        sns.set(style="whitegrid")
        sns.barplot(x="Value",
                    y="Feature",
                    data=feature_importance.sort_values(by="Value",
                                                        ascending=False))
        plt.title("Feature importance (%)")
        plt.tight_layout()
        plt.show()
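
# Entry point; invoked as e.g. `python train_xgb.py 123 5 -o`
# (the script name is hypothetical).
if __name__ == "__main__":
    main()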