def train(
    batch_size: int = defaults.BATCH_SIZE,
    num_train_epochs: float = defaults.NUM_TRAIN_EPOCHS,
):
    with open(defaults.data_filename("train_features", serializer="pkl"),
              "rb") as features_file:
        train_features = pickle.load(features_file)

    # Compute the total number of training steps from the dataset size,
    # batch size, and number of epochs.
    num_train_steps = int(len(train_features) / batch_size * num_train_epochs)

    estimator = create_estimator(num_train_steps=num_train_steps)

    # Create an input function for training. drop_remainder must be True
    # when running on TPUs; on CPU/GPU the final partial batch can be kept.
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=defaults.MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False,
    )

    print(f"Beginning Training!")
    current_time = pendulum.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", pendulum.now() - current_time)
def evaluate(model_dir: str = defaults.MODEL_DIR):
    with open(defaults.data_filename("test_features", serializer="pkl"),
              "rb") as features_file:
        test_features = pickle.load(features_file)

    test_input_fn = bert.run_classifier.input_fn_builder(
        features=test_features,
        seq_length=defaults.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False,
    )

    estimator = create_estimator()
    # steps=None evaluates until the input function is exhausted, i.e. over
    # the whole test set.
    print(estimator.evaluate(input_fn=test_input_fn, steps=None))
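# `load_directory_data` is used by load_dataset below but not included in
# these snippets. A minimal sketch, assuming it reads every review file in
# an aclImdb subdirectory into a one-column dataframe (the "sentence" column
# is what `create_bert_input_example` is assumed to consume):
def load_directory_data(directory: str) -> pd.DataFrame:
    data = {"sentence": []}
    for file_path in os.listdir(directory):
        with open(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
    return pd.DataFrame.from_dict(data)
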
# Load the positive and negative reviews of one split ("train" or "test")
# and shuffle them into a single labelled dataframe.
def load_dataset(directory: str) -> pd.DataFrame:
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)


# Download and process the dataset files.
def download_and_load_datasets(
        force_download: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # tf.keras.utils.get_file caches the archive under ~/.keras, so
    # force_download is accepted for API compatibility but currently unused.
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    train_df, test_df = (load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", subdir))
                         for subdir in DATASET_NAMES)

    return train_df, test_df


if __name__ == "__main__":
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    dataframes = download_and_load_datasets(force_download=FORCE_DOWNLOAD)

    for df, dataset_name in zip(dataframes, DATASET_NAMES):
        fastparquet.write(data_filename(dataset_name), df)
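

# `data_filename` is used throughout these snippets but not shown. A
# plausible helper, assuming files live under DATA_DIR and the `serializer`
# argument doubles as the file extension:
def data_filename(name: str, serializer: str = "parquet") -> str:
    return os.path.join(DATA_DIR, f"{name}.{serializer}")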


# Build a FullTokenizer from the vocab file and casing info published by the
# BERT hub module. The body follows the standard TF Hub recipe and assumes a
# BERT_MODEL_HUB url plus `import tensorflow_hub as hub`.
def create_tokenizer_from_hub_module() -> tokenization.FullTokenizer:
    with tf.Graph().as_default():
        tokenization_info = hub.Module(BERT_MODEL_HUB)(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"],
            ])
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)


def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[List[run_classifier.InputFeatures],
           List[run_classifier.InputFeatures]]:
    # Read each split back from parquet, subsample it, and turn every row
    # into a BERT InputExample.
    train_input_examples, test_input_examples = (
        ParquetFile(data_filename(dataset_name))
        .to_pandas()
        .sample(SAMPLE_SIZE)
        .apply(create_bert_input_example, axis=1)
        for dataset_name in DATASET_NAMES)

    # Tokenize, pad/truncate to MAX_SEQ_LENGTH, and map labels to ids.
    train_features, test_features = (
        run_classifier.convert_examples_to_features(input_examples, LABEL_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)
        for input_examples in (train_input_examples, test_input_examples))

    return train_features, test_features
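

# `create_bert_input_example` is applied row-wise above but not defined in
# these snippets. A plausible sketch, assuming the "sentence"/"polarity"
# columns produced by load_dataset:
def create_bert_input_example(row: pd.Series) -> run_classifier.InputExample:
    return run_classifier.InputExample(
        guid=None,  # unique example id; unused when fine-tuning
        text_a=row["sentence"],
        text_b=None,  # single-sentence classification, no second segment
        label=row["polarity"],
    )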


if __name__ == "__main__":
    tokenizer = create_tokenizer_from_hub_module()
    train_features, test_features = create_train_test_features(tokenizer)

    # Serialize each feature set, closing the files properly.
    for mode, features in (("train", train_features), ("test", test_features)):
        features_filename = data_filename(f"{mode}_features", serializer="pkl")
        with open(features_filename, "wb") as features_file:
            pickle.dump(features, features_file)
        print("Stored features file:", features_filename)
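
# Putting the snippets together, the intended run order appears to be:
#   1. the download script above (writes the train/test parquet files),
#   2. this feature script (writes the train/test pickle files),
#   3. train(), then evaluate().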