Example #1
def run_train(data_file_path, output_dir):
    print('***** Model output directory: {} *****'.format(output_dir))

    # get data from data loader
    train, _, _ = ContextualRelevance(data_file_path).get_data()
    print(train.columns)

    # Use the InputExample class from BERT's run_classifier code to create examples from the data
    train_InputExamples = train.apply(
        lambda x: run_classifier.InputExample(
            guid=None,  # globally unique ID for bookkeeping, unused in this example
            text_a=x[DATA_COLUMN],
            text_b=x[ANSWER_COLUMN],
            label=x[LABEL_COLUMN]),
        axis=1)

    # get the bert_code tokenizer from the hub model
    tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_HUB, False)
    print(tokenizer.tokenize("מריצים אימון..."))  # Hebrew: "running training..."

    # Convert the training examples to InputFeatures that BERT understands.
    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
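    # Each InputFeatures bundles input_ids, input_mask, segment_ids and a
    # label_id, padded/truncated to MAX_SEQ_LENGTH (per BERT's run_classifier).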

    # Compute the number of train and warmup steps from the batch size.
    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
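    # Worked example with hypothetical values: 1000 training features,
    # BATCH_SIZE = 32, NUM_TRAIN_EPOCHS = 3.0 and WARMUP_PROPORTION = 0.1
    # give num_train_steps = int(1000 / 32 * 3.0) = 93 and
    # num_warmup_steps = int(93 * 0.1) = 9.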

    # Specify the output directory and how often to save checkpoints.
    run_config = tf.compat.v1.estimator.RunConfig(
        model_dir=output_dir,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    model_fn = model_fn_builder(num_labels=len(label_list),
                                learning_rate=LEARNING_RATE,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                bert_model_hub=BERT_MODEL_HUB)

    estimator = tf.compat.v1.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": BATCH_SIZE})

    # Create an input function for training. Set drop_remainder=True when running on TPUs.
    train_input_fn = run_classifier.input_fn_builder(features=train_features,
                                                     seq_length=MAX_SEQ_LENGTH,
                                                     is_training=True,
                                                     drop_remainder=False)

    print('Beginning Training!')
    current_time = datetime.now()

    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.now() - current_time)
Example #2
def run(checkpoint_path, data_file_path, is_baseline_data):
    # get the model (make sure the checkpoint matches the model in the configuration file)
    _, test = ContextualRelevance(data_file_path,
                                  is_baseline=is_baseline_data).get_data()
    # if not is_baseline_data:
    #     test = test[test.Is_Expanded_Term == 1]

    # get the bert_code tokenizer from the hub model
    tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_HUB)

    test_InputExamples = test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN],
                                              text_b=x[ANSWER_COLUMN],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    test_input_fn = run_classifier.input_fn_builder(features=test_features,
                                                    seq_length=MAX_SEQ_LENGTH,
                                                    is_training=False,
                                                    drop_remainder=False)

    # num_train_steps and num_warmup_steps are not defined in this function;
    # assuming the usual BERT model_fn, the optimizer schedule they drive is
    # only built in TRAIN mode, so zeros are safe for evaluation.
    model_fn = model_fn_builder(num_labels=len(label_list),
                                learning_rate=LEARNING_RATE,
                                num_train_steps=0,
                                num_warmup_steps=0,
                                bert_model_hub=BERT_MODEL_HUB)

    estimator = tf.compat.v1.estimator.Estimator(
        model_fn, params={"batch_size": BATCH_SIZE})

    metric_result = estimator.evaluate(input_fn=test_input_fn,
                                       steps=None,
                                       checkpoint_path=checkpoint_path)
    precision = metric_result['precision']
    recall = metric_result['recall']
    metric_result['F1'] = 2 * (precision * recall) / (precision + recall)
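    # e.g., precision = 0.8 and recall = 0.6 give F1 = 0.96 / 1.4 ≈ 0.686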
    return metric_result
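
A usage sketch for run; the checkpoint path below is a hypothetical placeholder:

# Hypothetical checkpoint path -- point this at a checkpoint saved by run_train.
metrics = run(checkpoint_path='output/bert_contextual_relevance/model.ckpt-93',
              data_file_path='data/contextual_relevance.csv',
              is_baseline_data=False)
print(metrics['precision'], metrics['recall'], metrics['F1'])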
Example #3
train, test = aclImdb().get_data()
print(train.columns)

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(
    lambda x: run_classifier.InputExample(
        guid=None,  # globally unique ID for bookkeeping, unused in this example
        text_a=x[DATA_COLUMN],
        text_b=None,
        label=x[LABEL_COLUMN]),
    axis=1)

# get the bert_code tokenizer from the hub model
tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_HUB)

test_InputExamples = test.apply(
    lambda x: run_classifier.InputExample(guid=None,
                                          text_a=x[DATA_COLUMN],
                                          text_b=None,
                                          label=x[LABEL_COLUMN]),
    axis=1)

test_features = run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

test_input_fn = run_classifier.input_fn_builder(features=test_features,
                                                seq_length=MAX_SEQ_LENGTH,
                                                is_training=False,
                                                drop_remainder=False)

# Convert the training examples and compute train/warmup steps as in
# Example #1 (the call below is completed from the same pattern used there).
train_features = run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

model_fn = model_fn_builder(num_labels=len(label_list),
                            learning_rate=LEARNING_RATE,
                            num_train_steps=num_train_steps,
                            num_warmup_steps=num_warmup_steps,
                            bert_model_hub=BERT_MODEL_HUB)
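
The listing ends here; assuming Example #3 continues along the same flow as Example #1, a sketch of the remaining training steps would look like this (output_dir and the SAVE_* constants are assumed to be defined as in that example):

# Sketch under the assumption that Example #3 mirrors Example #1's flow.
run_config = tf.compat.v1.estimator.RunConfig(
    model_dir=output_dir,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

estimator = tf.compat.v1.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE})

train_input_fn = run_classifier.input_fn_builder(features=train_features,
                                                 seq_length=MAX_SEQ_LENGTH,
                                                 is_training=True,
                                                 drop_remainder=False)

estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)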