Example #1
    params_model = {
        'bsize': params.batch_size,
        'word_emb_dim': params.word_emb_dim,
        'enc_lstm_dim': params.enc_lstm_dim,
        'pool_type': params.pool_type,
        'dpout_model': params.dpout_model,
        'version': params.model_version,
    }
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    
    if params.vocab_samples.isdigit():
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        # Assumption: vocab_samples is a path to a file of sentences here;
        # InferSent's build_vocab expects sentences, not a K keyword.
        with open(params.vocab_samples) as f:
            sentences = [line.strip() for line in f]
        encoder.build_vocab(sentences, tokenize=True)

    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    
    encoder.to(device)
    
    


# model config
config_nli_model = {
    'n_words'        :  len(word_vec)         ,
    'word_emb_dim'   :  params.word_emb_dim   ,
    'enc_lstm_dim'   :  params.enc_lstm_dim   ,
    'n_enc_layers'   :  params.n_enc_layers   ,
    'dpout_model'    :  params.dpout_model    ,
    'dpout_fc'       :  params.dpout_fc       ,
    'fc_dim'         :  params.fc_dim         ,
    'bsize'          :  params.batch_size     ,
    'n_classes'      :  params.n_classes      ,
}
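In the original train_nli.py from the InferSent repo, a config like this (which there also carries keys such as 'encoder_type', 'nonlinear_fc' and 'pool_type', truncated from the snippet above) is handed to the NLI classifier. A minimal sketch of that step, assuming models.py from the repo is importable and the config is complete:

from models import NLINet

# Sketch only: NLINet reads several keys beyond the truncated dict above,
# so the full config from train_nli.py is assumed here.
nli_net = NLINet(config_nli_model)
nli_net.to(device)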
Example #2
def infersent(
    path_to_senteval: str,
    path_to_vectors: str,
    output_filepath: str = None,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates an InferSent model against the SentEval benchmark
    (see: https://github.com/facebookresearch/InferSent for information on the pre-trained model).
    Adapted from: https://github.com/facebookresearch/SentEval/blob/master/examples/infersent.py.
    """
    from models import InferSent

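    # SentEval calls `prepare` once with all of a task's samples (used here to
    # build the InferSent vocabulary) and `batcher` once per minibatch, which
    # must return a numpy array of sentence embeddings.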
    def prepare(params, samples):
        samples = _cleanup_batch(samples)
        params.infersent.build_vocab([" ".join(tokens) for tokens in samples],
                                     tokenize=False)

    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        sentences = [" ".join(tokens) for tokens in batch]
        embeddings = params.infersent.encode(sentences,
                                             bsize=params.batch_size,
                                             tokenize=False)
        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load InferSent model
    # TODO (John): Hardcoded these to move things along, but that should be fixed.
    V = 2
    MODEL_PATH = "resources/encoder/infersent%s.pkl" % V
    params_model = {
        "bsize": 64,
        "word_emb_dim": 300,
        "enc_lstm_dim": 2048,
        "pool_type": "max",
        "dpout_model": 0.0,
        "version": V,
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.to(device)
    # Load and initialize the model with word vectors
    infersent.set_w2v_path(path_to_vectors)

    trainable_params = sum(p.numel() for p in infersent.parameters()
                           if p.requires_grad)
    typer.secho(
        (f"{SUCCESS} Loaded InferSent model {MODEL_PATH}"
         f" with {trainable_params} trainable parameters."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config,
                                      verbose)
    params_senteval["infersent"] = infersent
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare,
                  output_filepath)

    return
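A hypothetical invocation of the function above; all paths are placeholders rather than values from the source:

# Placeholder paths: point these at your local SentEval checkout and
# fastText vector file.
infersent(
    path_to_senteval="SentEval",
    path_to_vectors="resources/fastText/crawl-300d-2M.vec",
    output_filepath="results/infersent_senteval.json",
    cuda_device=0,
    prototyping_config=True,  # smaller task set for quick runs
)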
Example #3
class SentenceEncoder:
    """
    Universal sentence encoder, based on https://github.com/facebookresearch/InferSent
    """
    def __init__(self, state_path=None, state_dict=None):
        assert bool(state_path) != bool(state_dict), \
            'Exactly one of state_path or state_dict must be given'
        self.model = InferSent(config=MODEL_CONF)
        if state_path:
            log.info(f"Loading state from {state_path}")
            state = torch.load(state_path, map_location=device)
        else:
            state = state_dict
        assert 'model' in state and 'word_vec' in state  # created by self.prepare() method

        self.model.load_state_dict(state['model'])
        self.model.word_vec = state['word_vec']
        self.maybe_gpu()

    def encode(self, sentences, tokenize=True, **kwargs):
        return self.model.encode(sentences, tokenize=tokenize, **kwargs)

    def maybe_gpu(self, device=device):
        self.model = self.model.to(device)

    def to_cpu(self):
        self.model = self.model.to(cpu_device)

    @staticmethod
    def prepare(model_path: str,
                word_vecs: str,
                out_path: str,
                sentences: Union[str, List[str]] = None,
                max_vocab: int = 0):
        """
        this method is for adapting the vocabulary,
        :param model_path: unadapted model state
        :param word_vecs: word vectors
        :param out_path: where to store the state
        :param sentences: training sentences for scanning the vocabulary
        :param max_vocab: maximum vocabulary size (optional)
        :return:
        """
        assert bool(sentences) != bool(max_vocab), \
            'Exactly one of sentences or max_vocab should be given'

        model = InferSent(config=MODEL_CONF)
        log.info(f"Loading state from {model_path}")
        model.load_state_dict(torch.load(model_path))
        log.info(f"Loading word vecs from {word_vecs}")
        model.set_w2v_path(word_vecs)
        if sentences:
            if not isinstance(sentences, list):
                sentences = list(read_lines(sentences))
            log.info("Building vocabulary from sentences")
            model.build_vocab(sentences, tokenize=True)
        if max_vocab:
            log.info(f"Pruning vocabulary to top {max_vocab} types")
            model.build_vocab_k_words(K=max_vocab)
        log.info(f"Saving at {out_path}")

        state = SentenceEncoder._get_state(model)
        torch.save(state, out_path)

    @classmethod
    def _get_state(cls, model):
        if isinstance(model, cls):
            model = model.model
        # InferSent's state_dict doesn't include word_vec, so we stash it
        # alongside the weights as a workaround
        return {'model': model.state_dict(), 'word_vec': model.word_vec}

    def get_state(self):
        return self._get_state(self)
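A hypothetical end-to-end use of the class above; the paths are placeholders and MODEL_CONF is assumed to be defined as in the surrounding module:

# Placeholder paths, not values from the source.
SentenceEncoder.prepare(
    model_path="resources/encoder/infersent2.pkl",
    word_vecs="resources/fastText/crawl-300d-2M.vec",
    out_path="resources/encoder/infersent2.adapted.pkl",
    sentences="data/train.sentences.txt",  # one sentence per line
)

encoder = SentenceEncoder(state_path="resources/encoder/infersent2.adapted.pkl")
vectors = encoder.encode(["the cat eats."])
# vectors: numpy array of shape (1, 4096) with the default 2048-unit BLSTM-max encoder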