Example no. 1
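All three excerpts rely on module-level imports and globals defined elsewhere in the project. A plausible preamble is sketched here for context; the project-local names (m, cleaning, create_data_loader, BinaryClassifier, train_epoch, eval_model, TRACKING_URI, EXPERIMENT_NAME, device, logger) are inferred from usage below and are assumptions, not code from the source.

# Sketch of the module-level setup these excerpts assume; not part of the source.
import logging
from collections import defaultdict
from datetime import datetime

import mlflow
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, get_linear_schedule_with_warmup
from transformers import AdamW  # transformers' AdamW (supports correct_bias); deprecated in newer releases

# Project-local modules and helpers, inferred from usage below (names are assumptions):
# import modeling as m                      # Posts, MLFlowLogger, Modeling
# import cleaning                           # text normalization
# from bert_model import BinaryClassifier, create_data_loader, train_epoch, eval_model

TRACKING_URI = "http://localhost:5000"      # placeholder MLflow tracking URI
EXPERIMENT_NAME = "gbert-base"              # placeholder experiment name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)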
def load_model_and_evaluate(state_dict: str, data: m.Posts, label: str, positive_class_weight: float):
    BATCH_SIZE = 8
    MAX_LEN = 264
    #EPOCHS = 10

    # ## MLflow instantiation
    IS_DEVELOPMENT = False
    mlflow_logger = m.MLFlowLogger(
        uri=TRACKING_URI,
        experiment=EXPERIMENT_NAME,
        is_dev=IS_DEVELOPMENT,
    )

    # ## Data loading
    normalize = lambda x: cleaning.normalize(x, url_emoji_dummy=False, pure_words=False)

    X_train, y_train = data.get_X_y('train')
    X_train = X_train.apply(normalize)
    df_train = pd.concat([X_train,y_train], axis=1)
    df_train.columns = ['text', label]

    X_val, y_val = data.get_X_y('val', balance_method='translate')
    X_val = X_val.apply(normalize)
    df_val = pd.concat([X_val,y_val], axis=1)
    df_val.columns = ['text', label]

    X_test, y_test = data.get_X_y('test', balance_method=None)
    X_test = X_test.apply(normalize)
    df_test = pd.concat([X_test,y_test], axis=1)
    df_test.columns = ['text', label]

    tokenizer = BertTokenizer.from_pretrained("deepset/gbert-base")
    train_data_loader = create_data_loader(df_train, label, tokenizer, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, label, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, label, tokenizer, MAX_LEN, BATCH_SIZE)

    # ## Instantiation
    model = BinaryClassifier().to(device)
    #optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    mlflow_logger.add_param('optimizer', 'AdamW')
    # https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss
    # From the BCEWithLogitsLoss() documentation:
    # > For example, if a dataset contains 100 positive and 300 negative examples of a single class,
    # > then pos_weight for the class should be equal to 300/100 = 3.
    # > The loss would act as if the dataset contains 3*100=300 positive examples.
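    # A common way to derive positive_class_weight for this loss (an assumption,
    # shown for illustration; the source passes the weight in as an argument):
    #   positive_class_weight = float((y_train == 0).sum() / (y_train == 1).sum())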
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([positive_class_weight])).to(device)

    # Load the saved weights (map_location keeps this working when no GPU is available)
    model.load_state_dict(torch.load(f"./models/{state_dict}.bin", map_location=device))
    model.eval()


    #history = defaultdict(list)
    train_fbeta, train_metrics, train_params, train_loss = eval_model(
        model,
        train_data_loader,
        loss_fn,
        device,
        'train',
    )
    logger.info(f'Train loss {train_loss} Fbeta {train_fbeta}')
    val_fbeta, val_metrics, val_params, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        'val',
    )
    logger.info(f'Val   loss {val_loss} Fbeta {val_fbeta}')
    test_fbeta, test_metrics, test_params, test_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        'test',
    )
    logger.info(f'Test  loss {test_loss} Fbeta {test_fbeta}')
    print()
    #history['train_fbeta'].append(train_fbeta)
    #history['train_loss'].append(train_loss)
    #history['val_fbeta'].append(val_fbeta)
    #history['val_loss'].append(val_loss)
    for k,v in train_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k,v in train_params.items():
        mlflow_logger.add_param(k, v)

    for k,v in val_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k,v in val_params.items():
        mlflow_logger.add_param(k, v)

    for k,v in test_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k,v in test_params.items():
        mlflow_logger.add_param(k, v)

    #############################################
    #MLflow logging

    #constant_params = {
    #                'epochs': EPOCHS,
    #                'batch_size': BATCH_SIZE,
    #                'max_len': MAX_LEN,
    #            }
    mlflow_logger.add_tag("cycle4", True)
    mlflow_logger.add_param("normalization", 'norm')
    mlflow_logger.add_param("vectorizer", 'deepset/gbert-base')
    mlflow_logger.add_param("model", "deepset/gbert-base")
    # Re-using the grid_search_params field to avoid creating too many MLflow columns
    #mlflow_logger.add_param("grid_search_params", str(constant_params)[:249]) 
    #mlflow_logger.add_param("lr", learning_rate)
    mlflow_logger.add_param('pos_weight', positive_class_weight)

    mlflow_logger.add_param("saved_model", state_dict)
    mlflow_logger.add_param("label", data.current_label)
    mlflow_logger.add_param("balance_method", data.balance_method)
    if data.balance_method:
        mlflow_logger.add_param("sampling_strategy", data.sampling_strategy)
    mlflow_logger.add_model(None)

    with mlflow.start_run(run_name='deepset/gbert-base') as run:
        mlflow_logger.log()
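A hypothetical call to load_model_and_evaluate; the state-dict name, label, and weight below are placeholders chosen for illustration, not values taken from the source:

# data: an m.Posts instance prepared elsewhere, with label and balance method already set
load_model_and_evaluate(
    state_dict='model_gbertbase_label_sentimentnegative_220101_120000',  # placeholder file stem under ./models/
    data=data,
    label='label_sentimentnegative',
    positive_class_weight=3.0,  # negatives / positives in the training split
)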
Example no. 2
        TARGET_LABELS = [
            'label_argumentsused',
            'label_discriminating',
            'label_inappropriate',
            'label_offtopic',
            'label_personalstories',
            'label_possiblyfeedback',
            'label_sentimentnegative',
            'label_sentimentpositive',
        ]

        IS_DEVELOPMENT = False

        mlflow_logger = m.MLFlowLogger(uri=TRACKING_URI,
                                       experiment=EXPERIMENT_NAME,
                                       is_dev=IS_DEVELOPMENT,
                                       params=mlflow_params,
                                       tags=mlflow_tags)
        training = m.Modeling(data, gs, mlflow_logger)
        for method, strat in trans_os.items():
            for strategy in strat:
                print(method, strategy)
                for label in TARGET_LABELS:
                    logger.info(f"-" * 20)
                    logger.info(f"Target: {label}")
                    data.set_label(label=label)
                    data.set_balance_method(balance_method=method,
                                            sampling_strategy=strategy)
                    training.train()
                    training.evaluate(["train", "val"])
                    #if True:
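The loop above assumes trans_os maps each balance method to an iterable of sampling strategies. Its actual contents are not shown in the excerpt; a plausible shape, purely as an assumption, is:

# Illustrative only; the real methods and strategies are defined elsewhere in the source
trans_os = {
    'translate': [0.5, 1.0],
    'oversampling': [0.5, 1.0],
}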
Example no. 3
def make_model(data: m.Posts, label: str, learning_rate: float, positive_class_weight: float):
    BATCH_SIZE = 8
    MAX_LEN = 264
    EPOCHS = 10

    # ## MLflow instantiation
    IS_DEVELOPMENT = False
    mlflow_logger = m.MLFlowLogger(
        uri=TRACKING_URI,
        experiment=EXPERIMENT_NAME,
        is_dev=IS_DEVELOPMENT,
    )

    # ## Data loading
    normalize = lambda x: cleaning.normalize(x, url_emoji_dummy=False, pure_words=False)

    X_train, y_train = data.get_X_y('train')
    X_train = X_train.apply(normalize)
    df_train = pd.concat([X_train,y_train], axis=1)
    df_train.columns = ['text', label]

    X_val, y_val = data.get_X_y('val', balance_method='translate')
    X_val = X_val.apply(normalize)
    df_val = pd.concat([X_val,y_val], axis=1)
    df_val.columns = ['text', label]

    #X_test, y_test = data.get_X_y('test')
    #X_test = X_test.apply(normalize)
    #df_test = pd.concat([X_test,y_test], axis=1)
    #df_test.columns = ['text', label]

    tokenizer = BertTokenizer.from_pretrained("deepset/gbert-base")
    train_data_loader = create_data_loader(df_train, label, tokenizer, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, label, tokenizer, MAX_LEN, BATCH_SIZE)
    #test_data_loader = create_data_loader(df_test, label, tokenizer, MAX_LEN, BATCH_SIZE)

    # ## Instantiation
    model = BinaryClassifier().to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    mlflow_logger.add_param('optimizer', 'AdamW')
    total_steps = len(train_data_loader) * EPOCHS
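    # With num_warmup_steps=0 the learning rate decays linearly from learning_rate
    # to 0 over total_steps (= batches per epoch * EPOCHS).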
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
        )
    # https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss
    # From the BCEWithLogitsLoss() documentation:
    # > For example, if a dataset contains 100 positive and 300 negative examples of a single class,
    # > then pos_weight for the class should be equal to 300/100 = 3.
    # > The loss would act as if the dataset contains 3*100=300 positive examples.
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([positive_class_weight])).to(device)

    history = defaultdict(list)
    best_fbeta = 0
    t = datetime.now().strftime("%y%m%d_%H%M%S")
    for epoch in range(EPOCHS):
        logger.info(f'Epoch {epoch + 1}/{EPOCHS}')
        logger.info('-' * 10)
        train_fbeta, train_metrics, train_params, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            'train',
        )
        logger.info(f'Train loss {train_loss} Fbeta {train_fbeta}')
        val_fbeta, val_metrics, val_params, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            'val',
        )
        logger.info(f'Val   loss {val_loss} Fbeta {val_fbeta}')
        print()
        history['train_fbeta'].append(train_fbeta)
        history['train_loss'].append(train_loss)
        history['val_fbeta'].append(val_fbeta)
        history['val_loss'].append(val_loss)
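        # Checkpoint the model and log its metrics whenever validation F-beta improves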
        if val_fbeta > best_fbeta:
            file_name = f"./models/model_gbertbase_{label}_{t}.bin"
            torch.save(model.state_dict(), file_name)
            for k,v in val_metrics.items():
                mlflow_logger.add_metric(k, v)
            for k,v in val_params.items():
                mlflow_logger.add_param(k, v)
            for k,v in train_metrics.items():
                mlflow_logger.add_metric(k, v)
            for k,v in train_params.items():
                mlflow_logger.add_param(k, v)
            best_fbeta = val_fbeta

    #############################################
    #MLflow logging

    constant_params = {
                    'epochs': EPOCHS,
                    'batch_size': BATCH_SIZE,
                    'max_len': MAX_LEN,
                }
    mlflow_logger.add_tag("cycle4", True)
    mlflow_logger.add_param("normalization", 'norm')
    mlflow_logger.add_param("vectorizer", 'deepset/gbert-base')
    mlflow_logger.add_param("model", "deepset/gbert-base")
    # Re-using the grid_search_params field to avoid creating too many MLflow columns
    mlflow_logger.add_param("grid_search_params", str(constant_params)[:249]) 
    mlflow_logger.add_param("lr", learning_rate)
    mlflow_logger.add_param('pos_weight', positive_class_weight)

    mlflow_logger.add_param("saved_model", f"model_gbertbase_{label}_{t}")
    mlflow_logger.add_param("label", data.current_label)
    mlflow_logger.add_param("balance_method", data.balance_method)
    if data.balance_method:
        mlflow_logger.add_param("sampling_strategy", data.sampling_strategy)
    mlflow_logger.add_model(None)

    with mlflow.start_run(run_name='deepset/gbert-base') as run:
        mlflow_logger.log()
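A hypothetical training call for make_model; the label, learning rate, and class weight are placeholders chosen for illustration:

# data: an m.Posts instance prepared elsewhere (label and balance method already set)
make_model(
    data=data,
    label='label_sentimentnegative',
    learning_rate=2e-5,          # a common fine-tuning rate for BERT-style models
    positive_class_weight=3.0,   # negatives / positives in the training split
)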