import pickle
from pathlib import Path

from farm.data_handler.processor import InferenceProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.modeling.wordembedding_utils import fit_s3e_on_corpus
from farm.utils import set_all_seeds, initialize_device_settings


def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0.1,
        lm_output_types=[],
        device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # Save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings",
                            gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e",
                            extraction_layer=-1, s3e_stats=s3e_stats, num_processes=0)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == basic_texts[0]["text"]
    # wrap in abs() so the checks actually bound the deviation from the expected values
    assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6
    assert abs(result[0]["vec"][-2] - 0.06285100416478565) < 1e-6
import logging
from collections import Counter
from pathlib import Path

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

logger = logging.getLogger(__name__)


def fit_s3e_on_corpus(processor, model, corpus, n_clusters=10, mean_removal=True,
                      pca_removal=True, pca_n_components=300, pca_n_top_components=10,
                      default_token_weight=1, min_token_occurrences=0,
                      svd_postprocessing=False, use_gpu=False, batch_size=50):
    """
    Pooling of word/token embeddings as described by Wang et al. in the paper
    "Efficient Sentence Embedding via Semantic Subspace Analysis" (https://arxiv.org/abs/2002.09620).
    Adapted from their implementation: https://github.com/BinWang28/Sentence-Embedding-S3E

    This method fits the "model" on a custom corpus. This includes deriving token weights from
    token occurrences in the corpus, creating the semantic clusters via k-means, and a couple of
    pre-/post-processing steps to normalize the embeddings. The resulting objects can be saved or
    passed directly to the Inferencer to get the actual embeddings for your sentences.

    Note: Some operations like `mean_removal` imply changes to the AdaptiveModel or Processor.
    That's why we return them.

    :param processor: FARM Processor with a Tokenizer used for reading the corpus (e.g. InferenceProcessor)
    :param model: FARM AdaptiveModel with an embedding layer in the LM
                  (currently only 'WordEmbedding_LM' is supported as a language model)
    :param corpus: Path to a text file or a str
    :param n_clusters: Number of clusters for S3E. The more clusters, the higher the dimensionality
                       of the resulting embeddings.
    :param mean_removal: bool, whether to remove the mean from the token embeddings (preprocessing)
    :param pca_removal: bool, whether to remove PCA components from the token embeddings (preprocessing)
    :param pca_n_components: int, how many PCA components to fit if `pca_removal` is enabled
    :param pca_n_top_components: int, how many top PCA components to remove if `pca_removal` is enabled
    :param default_token_weight: float, weight to assign to tokens that are in the vocab but not in the corpus
    :param min_token_occurrences: int, minimum number of occurrences in the corpus for keeping a token
                                  in the vocab. Helps to shrink the model & speed it up.
    :param svd_postprocessing: bool, whether to remove the top truncated SVD / LSA component from the
                               sentence embeddings (postprocessing). Note: Requires creating all sentence
                               embeddings once for the corpus, slowing down this method substantially.
                               Doesn't impact later inference speed though.
    :param use_gpu: bool, whether to use a GPU
    :param batch_size: int, batch size for the Inferencer (only needed when `svd_postprocessing` is enabled)
    :return: model, processor, s3e_stats
    """
    from farm.infer import Inferencer
    from farm.modeling.tokenization import tokenize_with_metadata

    # Get tokens of corpus
    if isinstance(corpus, Path):
        logger.info("Reading corpus for fitting S3E")
        with open(corpus, "r") as f:
            corpus = f.read()
    else:
        assert isinstance(corpus, str), "`corpus` must be of type str or Path"
    tokenized_corpus = tokenize_with_metadata(corpus, processor.tokenizer)["tokens"]
    token_counts = dict(Counter(tokenized_corpus))
    n_tokens = sum(token_counts.values())

    # Trim vocab & embeddings to the most frequent tokens (only to improve speed & RAM consumption)
    model.language_model.trim_vocab(token_counts, processor, min_threshold=min_token_occurrences)

    # Normalize embeddings
    model.language_model.normalize_embeddings(
        zero_mean=mean_removal,
        pca_removal=pca_removal,
        pca_n_components=pca_n_components,
        pca_n_top_components=pca_n_top_components)
    normalized_word_embs = model.language_model.model.embeddings.cpu().numpy()

    # Get token weights: the rarer a token is in the corpus, the higher its weight
    token_weights = {}
    eps = 1e-3
    for word, id in processor.tokenizer.vocab.items():
        if word in token_counts:
            token_weights[id] = eps / (eps + token_counts[word] / n_tokens)
        else:
            # tokens that are in the vocab but not present in the corpus get the default weight
            token_weights[id] = default_token_weight

    # Construct clusters
    weight_list = np.array(list(token_weights.values()))
    logger.info("Creating clusters for S3E embeddings")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(normalized_word_embs, sample_weight=weight_list)

    s3e_stats = {"token_to_cluster": kmeans.labels_,
                 "centroids": kmeans.cluster_centers_,
                 "token_weights": token_weights,
                 "svd_components": None}

    if svd_postprocessing:
        logger.info("Post-processing sentence embeddings using principal component removal")

        # Input
        sentences = [{"text": s} for s in corpus.split("\n") if len(s.strip()) > 0]

        # Get embeddings
        inferencer = Inferencer(model=model, processor=processor, task_type="embeddings",
                                gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e",
                                extraction_layer=-1, s3e_stats=s3e_stats)
        try:
            result = inferencer.inference_from_dicts(dicts=sentences)
        finally:
            # construct the Inferencer outside the try block so this call can't hit a
            # NameError if the constructor itself raises
            inferencer.close_multiprocessing_pool()
        sentence_embeddings = [s["vec"] for s in result]
        sentence_embeddings = np.vstack(sentence_embeddings)

        # Principal component removal
        svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
        svd.fit(sentence_embeddings)
        s3e_stats["svd_components"] = svd.components_

    return model, processor, s3e_stats
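
# What `extraction_strategy="s3e"` later does with these statistics is hidden inside the
# Inferencer. The sketch below is NOT FARM's implementation: it is a minimal, simplified
# variant of the S3E aggregation (weighted cluster residuals only, skipping the paper's
# intra-cluster covariance step), written just to make the role of each entry in
# `s3e_stats` visible. All names here are illustrative.
def s3e_pooling_sketch(token_ids, word_embs, s3e_stats):
    """
    Pool one sentence's token embeddings into a single vector.

    :param token_ids: list of vocab ids for the tokens in the sentence
    :param word_embs: np.ndarray of shape (vocab_size, emb_dim) with the normalized embeddings
    :param s3e_stats: dict as returned by fit_s3e_on_corpus()
    """
    centroids = s3e_stats["centroids"]                # (n_clusters, emb_dim)
    token_to_cluster = s3e_stats["token_to_cluster"]  # (vocab_size,)
    token_weights = s3e_stats["token_weights"]        # {token_id: weight}

    n_clusters, emb_dim = centroids.shape
    pooled = np.zeros((n_clusters, emb_dim))
    for tid in token_ids:
        cluster = token_to_cluster[tid]
        # accumulate the weighted residual of the token embedding w.r.t. its cluster centroid
        pooled[cluster] += token_weights[tid] * (word_embs[tid] - centroids[cluster])
    emb = pooled.flatten()

    # optional post-processing: project out the fitted truncated-SVD component(s)
    svd_components = s3e_stats["svd_components"]      # (n_components, n_clusters * emb_dim)
    if svd_components is not None:
        emb = emb - svd_components.T @ (svd_components @ emb)
    return emb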
def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False):
    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # Save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings",
                            gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e",
                            extraction_layer=-1, s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
    inferencer.close_multiprocessing_pool()
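
# fit() persists everything needed to embed new sentences without refitting. A matching
# load-side helper could look like the sketch below. It assumes that FARM's Inferencer.load()
# accepts the same S3E-related kwargs (extraction_strategy, extraction_layer, s3e_stats) as
# the constructor call above; treat it as a sketch, not the canonical loader.
def extract_embeddings(load_dir, use_gpu=False, batch_size=16):
    # restore the S3E statistics written by fit()
    with open(load_dir / "s3e_stats.pkl", "rb") as f:
        s3e_stats = pickle.load(f)

    # load model, tokenizer and processor from the directory written by fit()
    inferencer = Inferencer.load(model_name_or_path=str(load_dir), task_type="embeddings",
                                 gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e",
                                 extraction_layer=-1, s3e_stats=s3e_stats)

    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
    inferencer.close_multiprocessing_pool()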
inferencer = Inferencer(model=model,
                        processor=processor,
                        batch_size=4,
                        gpu=True,  # TODO: how to mix for multihead?
                        task_type="classification")

basic_texts = [
    {"text": "Some text you want to classify"},
    {"text": "A second sample"},
]
ret = inferencer.inference_from_dicts(basic_texts)
logger.info(f"Result of inference: {ret}")

logger.info("Evaluating on training set...")
evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"),
                      tasks=processor.tasks,
                      device=device)
result = evaluator.eval(inferencer.model, return_preds_and_labels=True)
evaluator.log_results(result, "Train", steps=len(data_silo.get_data_loader("train")))

inferencer.close_multiprocessing_pool()
logger.info("PROCESSING FINISHED")