params_model = {'bsize': params.batch_size, 'word_emb_dim': params.word_emb_dim, 'enc_lstm_dim': params.enc_lstm_dim , 'pool_type': params.pool_type, 'dpout_model': params.dpout_model, 'version': params.model_version} encoder = InferSent(params_model) encoder.load_state_dict(torch.load(params.encoder_path)) encoder.set_w2v_path(params.vector_rep) if params.vocab_samples.isdigit() : print("Build vocab from K samples") encoder.build_vocab_k_words(K=int(params.vocab_samples)) else: print("Build vocab from full file") encoder.build_vocab(K=params.vocab_samples) print("========TEST encoder=======") print(encoder.encode(['the cat eats.'])) encoder.to(device) # model config config_nli_model = { 'n_words' : len(word_vec) , 'word_emb_dim' : params.word_emb_dim , 'enc_lstm_dim' : params.enc_lstm_dim , 'n_enc_layers' : params.n_enc_layers , 'dpout_model' : params.dpout_model , 'dpout_fc' : params.dpout_fc , 'fc_dim' : params.fc_dim , 'bsize' : params.batch_size , 'n_classes' : params.n_classes ,
def infersent(
    path_to_senteval: str,
    path_to_vectors: str,
    output_filepath: str = None,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates an InferSent model against the SentEval benchmark (see:
    https://github.com/facebookresearch/InferSent for information on the pre-trained model).
    Adapted from: https://github.com/facebookresearch/SentEval/blob/master/examples/infersent.py.

    Args:
        path_to_senteval: Forwarded to `_setup_senteval` and `_run_senteval`.
        path_to_vectors: Word-vector file handed to `InferSent.set_w2v_path`.
        output_filepath: Forwarded to `_run_senteval`; may be `None`.
        cuda_device: Forwarded to `_get_device` to pick the torch device.
        prototyping_config: Forwarded to `_setup_senteval`.
        verbose: Forwarded to `_setup_senteval`.
    """
    from models import InferSent

    def prepare(params, samples):
        # Build the encoder vocabulary from the (pre-tokenized) task samples.
        samples = _cleanup_batch(samples)
        params.infersent.build_vocab([" ".join(tokens) for tokens in samples], tokenize=False)

    def batcher(params, batch):
        # Re-join the tokens of each example and embed the whole batch at once.
        batch = _cleanup_batch(batch)
        sentences = [" ".join(tokens) for tokens in batch]
        return params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False)

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load InferSent model
    # TODO (John): Hardcoded these to move things along, but that should be fixed.
    V = 2
    MODEL_PATH = f"resources/encoder/infersent{V}.pkl"
    params_model = {
        "bsize": 64,
        "word_emb_dim": 300,
        "enc_lstm_dim": 2048,
        "pool_type": "max",
        "dpout_model": 0.0,
        "version": V,
    }
    infersent = InferSent(params_model)
    # Deserialize onto the CPU first so a checkpoint saved from a GPU can still be
    # loaded on a CPU-only machine; the model is moved to the target device below.
    infersent.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))
    infersent.to(device)
    # Load and initialize the model with word vectors
    infersent.set_w2v_path(path_to_vectors)

    trainable_params = sum(p.numel() for p in infersent.parameters() if p.requires_grad)
    typer.secho(
        (f"{SUCCESS} Loaded InferSent model {MODEL_PATH}" f" with {trainable_params} trainable parameters."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    # Expose the encoder to `prepare` / `batcher` through the SentEval params.
    params_senteval["infersent"] = infersent
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare, output_filepath)
class SentenceEncoder:
    """
    Universal sentence encoder, based on https://github.com/facebookresearch/InferSent

    Wraps an ``InferSent`` model together with its ``word_vec`` vocabulary so that
    both can be stored in, and restored from, a single state dictionary of the
    form ``{'model': <state_dict>, 'word_vec': <word vectors>}``.
    """

    def __init__(self, state_path=None, state_dict=None):
        """
        Restore an encoder from a state produced by :meth:`prepare` / :meth:`get_state`.

        :param state_path: path of a saved state (loaded with ``torch.load``)
        :param state_dict: an already loaded state; exactly one of the two must be given
        """
        assert bool(state_path) != bool(
            state_dict), 'Either state_path or state_dict must be there'
        self.model = InferSent(config=MODEL_CONF)
        if state_path:
            log.info(f"Loading state from {state_path}")
            state = torch.load(state_path, map_location=device)
        else:
            state = state_dict
        assert 'model' in state and 'word_vec' in state  # created by self.prepare() method
        self.model.load_state_dict(state['model'])
        self.model.word_vec = state['word_vec']
        self.maybe_gpu()

    def encode(self, sentences, tokenize=True, **kwargs):
        """Encode ``sentences`` by delegating to the wrapped model's ``encode``."""
        return self.model.encode(sentences, tokenize=tokenize, **kwargs)

    def maybe_gpu(self, device=device):
        """Move the wrapped model to ``device`` (defaults to the module-level device)."""
        self.model = self.model.to(device)

    def to_cpu(self):
        """Move the wrapped model to the module-level ``cpu_device``."""
        self.model = self.model.to(cpu_device)

    @staticmethod
    def prepare(model_path: str, word_vecs: str, out_path: str,
                sentences: Union[str, List[str], None] = None, max_vocab: int = 0):
        """
        this method is for adapting the vocabulary,
        :param model_path: unadapted model state
        :param word_vecs: word vectors
        :param out_path: where to store the state
        :param sentences: training sentences for scanning the vocabulary; anything
            that is not a list is passed to ``read_lines`` to obtain the sentences
        :param max_vocab: maximum vocabulary size (optional)
        :return:
        """
        assert bool(sentences) != bool(
            max_vocab), 'Either sentences or max_vocab should be given'
        model = InferSent(config=MODEL_CONF)
        log.info(f"Loading state from {model_path}")
        # Deserialize onto the CPU: this method never moves the model to another
        # device, and a checkpoint saved from a GPU must stay loadable without one.
        model.load_state_dict(torch.load(model_path, map_location="cpu"))
        log.info(f"Loading word vecs from {word_vecs}")
        model.set_w2v_path(word_vecs)
        if sentences:
            if not isinstance(sentences, list):
                sentences = list(read_lines(sentences))
            log.info("Building vocabulary from sentences")
            model.build_vocab(sentences, tokenize=True)
        if max_vocab:
            log.info(f"Pruning vocabulary to top {max_vocab} types")
            model.build_vocab_k_words(K=max_vocab)
        log.info(f"Saving at {out_path}")
        state = SentenceEncoder._get_state(model)
        torch.save(state, out_path)

    @classmethod
    def _get_state(cls, model):
        """Build the saveable state for ``model`` (an encoder wrapper or a raw model)."""
        if isinstance(model, cls):
            model = model.model
        # by default InferSent doesnt pickle word_vec, so this hack
        return {'model': model.state_dict(), 'word_vec': model.word_vec}

    def get_state(self):
        """Return the saveable state of this encoder (see :meth:`_get_state`)."""
        return self._get_state(self)