def test_s3e_fit(): # small test data language_model = Path("samples/s3e/tiny_fasttext_model") corpus_path = Path("samples/s3e/tiny_corpus.txt") save_dir = Path("testsave/fitted_s3e/") do_lower_case = False batch_size = 2 use_gpu = False # Fit S3E on a corpus set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) # Create a InferenceProcessor tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) # Create an AdaptiveModel language_model = LanguageModel.load(language_model) model = AdaptiveModel( language_model=language_model, prediction_heads=[], embeds_dropout_prob=0.1, lm_output_types=[], device=device) model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, model=model, corpus=corpus_path, n_clusters=3, pca_n_components=30, svd_postprocessing=True, min_token_occurrences=1) # save everything to allow inference without fitting everything again model.save(save_dir) processor.save(save_dir) with open(save_dir / "s3e_stats.pkl", "wb") as f: pickle.dump(s3e_stats, f) # Load model, tokenizer and processor directly into Inferencer inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, s3e_stats=s3e_stats, num_processes=0) # Input basic_texts = [ {"text": "a man is walking on the street."}, {"text": "a woman is walking on the street."}, ] # Get embeddings for input text (you can vary the strategy and layer) result = inferencer.inference_from_dicts(dicts=basic_texts) assert result[0]["context"] == basic_texts[0]["text"] assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6 assert result[0]["vec"][-2] - 0.06285100416478565 < 1e-6
def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False): # Fit S3E on a corpus set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) # Create a InferenceProcessor tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) # Create an AdaptiveModel language_model = LanguageModel.load(language_model) model = AdaptiveModel(language_model=language_model, prediction_heads=[], embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device) model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, model=model, corpus=corpus_path, n_clusters=10, pca_n_components=300, svd_postprocessing=True, min_token_occurrences=1) # save everything to allow inference without fitting everything again model.save(save_dir) processor.save(save_dir) with open(save_dir / "s3e_stats.pkl", "wb") as f: pickle.dump(s3e_stats, f) # Load model, tokenizer and processor directly into Inferencer inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, s3e_stats=s3e_stats) # Input basic_texts = [ { "text": "a man is walking on the street." }, { "text": "a woman is walking on the street." }, ] # Get embeddings for input text (you can vary the strategy and layer) result = inferencer.inference_from_dicts(dicts=basic_texts) print(result) inferencer.close_multiprocessing_pool()