def train(
    batch_size: int = defaults.BATCH_SIZE,
    num_train_epochs: float = defaults.NUM_TRAIN_EPOCHS,
):
    train_features = pickle.load(
        open(defaults.data_filename("train_features", serializer="pkl"), "rb"))

    # Compute the number of training steps from the batch size and epoch count.
    num_train_steps = int(len(train_features) / batch_size * num_train_epochs)
    estimator = create_estimator(num_train_steps=num_train_steps)

    # Create an input function for training (drop_remainder would be True when
    # running on TPUs).
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=defaults.MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False,
    )

    print("Beginning Training!")
    current_time = pendulum.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", pendulum.now() - current_time)
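# `create_estimator` is called above and in `evaluate` below but is not part of this
# excerpt. A minimal sketch of what it might look like, assuming a project-level
# `model_fn_builder` (in the spirit of the TF Hub BERT fine-tuning tutorial) and
# hypothetical `defaults.LEARNING_RATE` / `defaults.LABEL_LIST` constants; this is an
# illustration, not the project's actual implementation.
import tensorflow as tf

import defaults


def create_estimator(num_train_steps: int = None,
                     num_warmup_steps: int = None) -> tf.estimator.Estimator:
    run_config = tf.estimator.RunConfig(
        model_dir=defaults.MODEL_DIR,
        save_summary_steps=100,       # assumed cadence
        save_checkpoints_steps=500,   # assumed cadence
    )
    # model_fn_builder is assumed to wrap the BERT module, classification head,
    # optimizer, and eval metrics into an Estimator-compatible model_fn.
    model_fn = model_fn_builder(
        num_labels=len(defaults.LABEL_LIST),
        learning_rate=defaults.LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
    )
    # bert.run_classifier.input_fn_builder reads the batch size from params,
    # so it is passed here rather than baked into the input functions.
    return tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": defaults.BATCH_SIZE},
    )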
def evaluate(model_dir: str = defaults.MODEL_DIR):
    test_features = pickle.load(
        open(defaults.data_filename("test_features", serializer="pkl"), "rb")
    )

    test_input_fn = bert.run_classifier.input_fn_builder(
        features=test_features,
        seq_length=defaults.MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False,
    )

    estimator = create_estimator()
    print(estimator.evaluate(input_fn=test_input_fn, steps=None))
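# With `steps=None`, `Estimator.evaluate` runs until `test_input_fn` is exhausted and
# returns a dict of metrics, which is what gets printed above. A hypothetical direct
# invocation of the two entry points (the project may wire them into a CLI instead):
if __name__ == "__main__":
    train(batch_size=defaults.BATCH_SIZE,
          num_train_epochs=defaults.NUM_TRAIN_EPOCHS)
    evaluate(model_dir=defaults.MODEL_DIR)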
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)


# Download and process the dataset files.
def download_and_load_datasets(
        force_download: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )
    train_df, test_df = (load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", subdir))
        for subdir in DATASET_NAMES)
    return train_df, test_df


if __name__ == "__main__":
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
    dataframes = download_and_load_datasets(force_download=FORCE_DOWNLOAD)
    for df, dataset_name in zip(dataframes, DATASET_NAMES):
        fastparquet.write(data_filename(dataset_name), df)
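# `load_directory_data` is used above but defined elsewhere. A minimal, self-contained
# sketch of a compatible implementation; reading with plain `open` and the "sentence"
# column name are assumptions, not necessarily what the project does.
import os

import pandas as pd


def load_directory_data(directory: str) -> pd.DataFrame:
    # Read every review file of one aclImdb split/polarity directory (e.g. train/pos)
    # into a one-column DataFrame; the "polarity" label is added by the caller.
    rows = []
    for file_name in os.listdir(directory):
        with open(os.path.join(directory, file_name), encoding="utf-8") as f:
            rows.append({"sentence": f.read()})
    return pd.DataFrame(rows)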
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)


def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[run_classifier.InputFeatures, run_classifier.InputFeatures]:
    train_input_examples, test_input_examples = (ParquetFile(
        data_filename(dataset_name)).to_pandas().sample(SAMPLE_SIZE).apply(
            create_bert_input_example, axis=1)
        for dataset_name in DATASET_NAMES)
    train_features, test_features = (
        run_classifier.convert_examples_to_features(input_examples, LABEL_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)
        for input_examples in (train_input_examples, test_input_examples))
    return train_features, test_features


if __name__ == "__main__":
    tokenizer = create_tokenizer_from_hub_module()
    train_features, test_features = create_train_test_features(tokenizer)

    train_features_file, test_features_file = (open(
        data_filename(f"{mode}_features", serializer="pkl"), "wb")
        for mode in ["train", "test"])
    pickle.dump(train_features, train_features_file)
    pickle.dump(test_features, test_features_file)
    print("Stored train and test features files=", train_features_file,
          test_features_file)
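# `create_bert_input_example`, applied row-wise in `create_train_test_features`, is not
# part of this excerpt. A plausible sketch: wrap each review row in a
# `run_classifier.InputExample`. The "sentence" column name is an assumption, while
# "polarity" matches the label column created during data preparation.
import pandas as pd
from bert import run_classifier


def create_bert_input_example(row: pd.Series) -> run_classifier.InputExample:
    # Single-sentence classification: text_b stays None and the 0/1 polarity
    # becomes the example label.
    return run_classifier.InputExample(
        guid=None,
        text_a=row["sentence"],
        text_b=None,
        label=row["polarity"],
    )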