def test_load_old_checkpoint():
    dataset = AmazonReviewSentimentCrossLingualDataset()
    sha1sum_id = "4ba096cdf6bd76c06386f2c27140db055e59c91b"
    checkpoint_name = "mdeberta-v3-base-checkpoint"
    save_path = os.path.join(get_home_dir(), "checkpoints")
    file_path = os.path.join(save_path, f"{checkpoint_name}.zip")
    checkpoint_path = os.path.join(get_home_dir(), "checkpoints", checkpoint_name)
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    download(
        url=f"s3://automl-mm-bench/unit-tests-0.4/checkpoints/{checkpoint_name}.zip",
        path=file_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        file_path,
        sha1_hash=sha1sum_id,
        folder=save_path,
    )
    predictor = TextPredictor.load(checkpoint_path)
    verify_predictor_save_load(predictor, dataset.test_df)
    # Continuous training on top of the old checkpoint
    predictor.fit(
        dataset.train_df,
        presets="multilingual",
        time_limit=10,
        hyperparameters={"optimization.top_k_average_method": "uniform_soup"},
    )
    verify_predictor_save_load(predictor, dataset.test_df)
def test_predictor_fit(key):
    train_data = load_pd.load(DATA_INFO[key]['train'])
    dev_data = load_pd.load(DATA_INFO[key]['dev'])
    label = DATA_INFO[key]['label']
    eval_metric = DATA_INFO[key]['metric']
    verify_proba = DATA_INFO[key]['verify_proba']
    # Subsample deterministically so the test stays fast and reproducible.
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = TextPredictor(label=label, eval_metric=eval_metric)
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30, seed=123)
    dev_score = predictor.evaluate(dev_data)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
    # Test continuous fit on the in-memory predictor
    predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30, seed=123)
    verify_predictor_save_load(predictor, dev_data, verify_proba=verify_proba)
    # Save to a folder, load the saved model, and call fit again (continuous fit)
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = TextPredictor.load(root)
        predictor.fit(train_data, hyperparameters=get_test_hyperparameters(), time_limit=30, seed=123)
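# A minimal sketch of what the get_test_hyperparameters() helper used above
# might return, so the test reads end to end: a small backbone and a single
# epoch keep unit tests fast. The exact keys and values here are assumptions,
# not the confirmed helper from the test suite.
def get_test_hyperparameters():
    return {
        "model.hf_text.checkpoint_name": "google/electra-small-discriminator",  # assumed small backbone
        "optimization.max_epochs": 1,  # one epoch is enough for a smoke test
    }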
def predict(args):
    if args.use_tabular:
        predictor = TabularPredictor.load(args.model_dir)
    else:
        predictor = TextPredictor.load(args.model_dir)
    test_prediction = predictor.predict(args.test_file, as_pandas=True)
    if args.exp_dir is None:
        args.exp_dir = '.'
    test_prediction.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
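# A hypothetical argparse setup that would drive predict() above. The flag
# names simply mirror the attributes the function reads (model_dir, test_file,
# exp_dir, use_tabular); they are illustrative, not confirmed by the source
# script.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Run inference with a saved predictor.')
    parser.add_argument('--model_dir', required=True, help='Directory of the saved predictor.')
    parser.add_argument('--test_file', required=True, help='Path to the test data file.')
    parser.add_argument('--exp_dir', default=None, help='Output directory for test_prediction.csv.')
    parser.add_argument('--use_tabular', action='store_true',
                        help='Load a TabularPredictor instead of a TextPredictor.')
    return parser.parse_args()

if __name__ == '__main__':
    predict(parse_args())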
@classmethod
def load(cls, path: str, reset_paths=True, verbose=True):
    model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
    if model._load_model:
        try_import_autogluon_text()
        from autogluon.text import TextPredictor
        # The wrapped TextPredictor is stored separately from the pickled model.
        model.model = TextPredictor.load(os.path.join(path, cls.nn_model_name))
    model._load_model = None
    return model
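# For context, a sketch of the save() counterpart this load() implies: the
# wrapped TextPredictor cannot be pickled with the rest of the model, so it is
# detached, saved separately under nn_model_name, and _load_model records
# whether load() needs to restore it. This is an assumed reconstruction, not
# the library's verbatim code.
def save(self, path: str = None, verbose=True) -> str:
    self._load_model = self.model is not None
    nn_model = self.model
    self.model = None  # exclude the TextPredictor from the pickled state
    path = super().save(path=path, verbose=verbose)
    self.model = nn_model
    if self._load_model:
        # Persist the TextPredictor in its own subdirectory next to the pickle.
        self.model.save(os.path.join(path, self.nn_model_name))
    return path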
@classmethod
def load(cls, path: str, reset_paths=True, verbose=True):
    try:
        from autogluon.text import TextPredictor
    except ImportError:
        raise ImportError(AG_TEXT_IMPORT_ERROR)
    model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
    model.model = TextPredictor.load(os.path.join(path, cls.nn_model_name))
    return model
def verify_predictor_save_load(predictor, df, verify_proba=False, verify_embedding=True):
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictions = predictor.predict(df)
        loaded_predictor = TextPredictor.load(root)
        predictions2 = loaded_predictor.predict(df)
        npt.assert_equal(predictions, predictions2)
        if verify_proba:
            predictions_prob = predictor.predict_proba(df)
            predictions2_prob = loaded_predictor.predict_proba(df)
            npt.assert_equal(predictions_prob, predictions2_prob)
        if verify_embedding:
            embeddings = predictor.predict_features(df)
            assert embeddings.shape[0] == len(df)
def verify_predictor_save_load(predictor, df, verify_proba=False, verify_embedding=True):
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictions = predictor.predict(df, as_pandas=False)
        loaded_predictor = TextPredictor.load(root)
        predictions2 = loaded_predictor.predict(df, as_pandas=False)
        predictions2_df = loaded_predictor.predict(df, as_pandas=True)
        npt.assert_equal(predictions, predictions2)
        npt.assert_equal(predictions2, predictions2_df.to_numpy())
        if verify_proba:
            predictions_prob = predictor.predict_proba(df, as_pandas=False)
            predictions2_prob = loaded_predictor.predict_proba(df, as_pandas=False)
            predictions2_prob_df = loaded_predictor.predict_proba(df, as_pandas=True)
            npt.assert_equal(predictions_prob, predictions2_prob)
            npt.assert_equal(predictions2_prob, predictions2_prob_df.to_numpy())
        if verify_embedding:
            embeddings = predictor.extract_embedding(df)
            assert embeddings.shape[0] == len(df)
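# Hypothetical usage of the helper above in a test: fit a predictor on a tiny
# inline DataFrame, then check that predictions survive a save/load round
# trip. The toy data and test name are illustrative only.
def test_save_load_roundtrip():
    df = pd.DataFrame(
        {'text': ['good movie', 'bad movie', 'great plot', 'dull plot'],
         'label': [1, 0, 1, 0]}
    )
    predictor = TextPredictor(label='label')
    predictor.fit(df, hyperparameters=get_test_hyperparameters(), time_limit=30, seed=123)
    verify_predictor_save_load(predictor, df, verify_proba=True)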
def main(args):
    tasks = {
        'cola': ['CoLA.tsv', 'glue/cola/test.tsv'],
        'sst': ['SST-2.tsv', 'glue/sst/test.tsv'],
        'mrpc': ['MRPC.tsv', 'glue/mrpc/test.tsv'],
        'sts': ['STS-B.tsv', 'glue/sts/test.tsv'],
        'qqp': ['QQP.tsv', 'glue/qqp/test.tsv'],
        'mnli_m': ['MNLI-m.tsv', 'glue/mnli/test_matched.tsv'],
        'mnli_mm': ['MNLI-mm.tsv', 'glue/mnli/test_mismatched.tsv'],
        'qnli': ['QNLI.tsv', 'glue/qnli/test.tsv'],
        'rte': ['RTE.tsv', 'glue/rte/test.tsv'],
        'wnli': ['WNLI.tsv', 'glue/wnli/test.tsv'],
        'ax': ['AX.tsv', 'glue/rte_diagnostic/diagnostic.tsv'],
    }
    os.makedirs(args.save_dir, exist_ok=True)
    for task, (save_name, test_file_path) in tasks.items():
        if task == 'ax':
            # For AX, we need to load the mnli-m checkpoint and run inference
            test_df = pd.read_csv(test_file_path, sep='\t', header=0)
            test_index = test_df['index']
            predictor = TextPredictor.load(f'{args.prefix}_mnli_m')
            label_column = predictor.label
            predictions = predictor.predict(test_df)
        else:
            test_index = get_test_index(test_file_path)
            prediction_df = pd.read_csv(f'{args.prefix}_{task}/test_prediction.csv', index_col=0)
            label_column = prediction_df.columns[0]
            predictions = prediction_df[label_column]
        if task == 'sts':
            # STS-B scores are similarity ratings; clip to the valid [0, 5] range.
            predictions = np.clip(predictions, 0, 5)
        with open(os.path.join(args.save_dir, save_name), 'w') as of:
            of.write('index\t{}\n'.format(label_column))
            for i in range(len(predictions)):
                of.write('{}\t{}\n'.format(test_index[i], predictions[i]))
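# A plausible sketch of the get_test_index() helper used above: read the GLUE
# test TSV and return its index column. quoting=csv.QUOTE_NONE guards against
# stray quote characters in some GLUE test files; this implementation is an
# assumption, not the confirmed helper.
import csv

def get_test_index(test_file_path):
    test_df = pd.read_csv(test_file_path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    return test_df['index']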
def test_standalone_with_emoji():
    import tempfile
    from unittest import mock

    requests_gag = mock.patch(
        'requests.Session.request',
        mock.Mock(side_effect=RuntimeError(
            'Please use the `responses` library to mock HTTP in your tests.'
        ))
    )
    data = []
    for i in range(50 * 3):
        data.append(('😁' * (i + 1), 'grin'))
    for i in range(30 * 3):
        data.append(('😃' * (i + 1), 'smile'))
    for i in range(20 * 3):
        data.append(('😉' * (i + 1), 'wink'))
    df = pd.DataFrame(data, columns=['data', 'label'])
    predictor = TextPredictor(label='label', verbosity=3)
    predictor.fit(
        df,
        hyperparameters=get_test_hyperparameters(),
        time_limit=5,
        seed=123,
    )
    predictions1 = predictor.predict(df, as_pandas=False)
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root, standalone=True)
        with requests_gag:  # no internet connections
            offline_predictor = TextPredictor.load(root)
            predictions2 = offline_predictor.predict(df, as_pandas=False)
        npt.assert_equal(predictions1, predictions2)