Example 1
def get_elmo_embedder():
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    return word_embeddings
Example 2
def load_elmo_embeddings(large=True):
    """
    Loads pre-trained ELMo embeddings ('large' model by default).
    
    Parameters
    ----------
    large: bool
        Set to True to load the Large ELMo model; False for small ELMo model
    
    Returns
    -------
    TextFieldEmbedder
    """
    if large:  # use the Large pre-trained model
        print("Loading LARGE ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    else:  # use the Small pre-trained model
        print("Loading SMALL ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    print("Pre-trained ELMo loaded..")
    return word_embeddings
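
A minimal usage sketch of the returned embedder, mirroring the pattern in Example 7 below (the "tokens" key is an assumption that must match the embedder key above, and the exact tensor-dict layout depends on the AllenNLP version in use):

# Sketch: embed one whitespace-tokenised sentence with the loaded ELMo embedder.
from allennlp.data import Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

word_embeddings = load_elmo_embeddings(large=False)
field = TextField([Token(w) for w in "The cat sat".split()],
                  {"tokens": ELMoTokenCharactersIndexer()})
field.index(Vocabulary())
tensors = field.batch_tensors([field.as_tensor(field.get_padding_lengths())])
embedded = word_embeddings(tensors)  # (1, 3, 256) for the small model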
Example 3
def get_embedder_info(
    embedder_type: str
) -> Tuple[TokenEmbedder, TokenIndexer, Dict[str, Any]]:
    embedder_type = embedder_type.lower()
    text_field_embedder_kwargs: Dict[str, Any] = {}
    if embedder_type == 'ner_elmo':
        return (NERElmoTokenEmbedder(), ELMoTokenCharactersIndexer(),
                text_field_embedder_kwargs)
    elif embedder_type == 'elmo':
        return (ElmoTokenEmbedder(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE),
                ELMoTokenCharactersIndexer(), text_field_embedder_kwargs)
    elif embedder_type == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=512,  # max pieces allowed for positional embeddings
            do_lowercase=True,
            use_starting_offsets=True,
        )
        text_field_embedder_kwargs['allow_unmatched_keys'] = True
        text_field_embedder_kwargs['embedder_to_indexer_map'] = {
            "tokens": ["tokens", "tokens-offsets"]
        }

        return bert_embedder, token_indexer, text_field_embedder_kwargs
    else:
        raise Exception(f'Unknown embedder type: {embedder_type}')
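
The caller is not part of this snippet; a hypothetical call site (AllenNLP 0.x-style, with an illustrative "tokens" key) would splat the returned kwargs into the text-field embedder:

# Hypothetical call site for get_embedder_info; the "tokens" key is illustrative
# and must match the key used by the dataset reader's token indexers.
token_embedder, token_indexer, embedder_kwargs = get_embedder_info('bert')
text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedder},
                                             **embedder_kwargs)
token_indexers = {"tokens": token_indexer}  # hand these to the DatasetReader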
Example 4
    def _run_test(self, requires_grad):
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        embedder = ElmoTokenEmbedder(options_file,
                                     weight_file,
                                     requires_grad=requires_grad)
        batch_size = 3
        seq_len = 4
        char_ids = Variable(
            torch.from_numpy(
                numpy.random.randint(0, 262, (batch_size, seq_len, 50))))
        embeddings = embedder(char_ids)
        loss = embeddings.sum()
        loss.backward()

        elmo_grads = [
            param.grad for name, param in embedder.named_parameters()
            if '_elmo_lstm' in name
        ]
        if requires_grad:
            # None of the elmo grads should be None.
            assert all([grad is not None for grad in elmo_grads])
        else:
            # All of the elmo grads should be None.
            assert all([grad is None for grad in elmo_grads])
Example 5
def get_embeddings(embedder_type,
                   vocab,
                   embedding_dim=300,
                   bert_trainable=True):
    if embedder_type not in valid_embedders:
        raise Exception(f'Unknown embedder type {embedder_type}')
    vocab_size = vocab.get_vocab_size('tokens')
    token_embedders = {}
    if embedder_type == 'random':
        token_embedding = Embedding(vocab_size, embedding_dim, trainable=True)
        token_embedders['tokens'] = token_embedding
    if embedder_type in ['glove', 'elmo_and_glove']:
        weights = load_glove_weights(vocab)
        token_embedding = Embedding(vocab_size,
                                    embedding_dim,
                                    weight=weights,
                                    trainable=True)
        token_embedders['tokens'] = token_embedding
    if embedder_type in ['elmo', 'elmo_and_glove']:
        elmo_token_embedder = ElmoTokenEmbedder(
            'embeddings/elmo_2x4096_512_2048cnn_2xhighway_options.json',
            'embeddings/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
            do_layer_norm=False,
            dropout=0.5)
        token_embedders['elmo'] = elmo_token_embedder
    if 'bert' in embedder_type:
        token_embedders['bert'] = BertEmbedder(bert_type=embedder_type,
                                               trainable=bert_trainable)

    word_embeddings = BasicTextFieldEmbedder(token_embedders)
    return word_embeddings
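
Each key placed in token_embedders above has to line up with a token indexer key in the dataset reader; a sketch of a matching indexer dict for the 'elmo_and_glove' case (the indexer choices are assumptions):

# Sketch: indexer keys mirror the embedder keys built in get_embeddings
# ("tokens" for GloVe/random ids, "elmo" for ELMo character ids).
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer

token_indexers = {
    "tokens": SingleIdTokenIndexer(),
    "elmo": ELMoTokenCharactersIndexer(),
}
word_embeddings = get_embeddings('elmo_and_glove', vocab)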
Example 6
def build_elmo_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedding = ElmoTokenEmbedder()
    # The embedder key below must match the token indexer key used by the dataset reader.
    embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedder.get_output_dim(), averaged=True)
    
    return SimpleClassifier(vocab, embedder, encoder)
Example 7
def run_ELMo_RSA(stim_file, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = WhitespaceTokenizer()

    #Load model
    ##ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'

    #ELMo Small
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'

    #ELMo Medium
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'

    #ELMo OG (5.5B)
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens,
                                      {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(
            target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors(
            [target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        #GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
Example 8
def build_model(options_file, weight_file):
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_size, bidirectional=True, batch_first=True))
    model = BaselineModel(word_embeddings, encoder, vocab)

    return model, iterator, vocab
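
The module-level config used above is outside this snippet; a plausible stand-in carrying the two attributes build_model reads (the values are illustrative):

# Hypothetical stand-in for the external `config` object used by build_model.
from types import SimpleNamespace

config = SimpleNamespace(batch_size=32, hidden_size=128)  # illustrative values
model, iterator, vocab = build_model(options_file, weight_file)  # ELMo options/weights paths or URLs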
Example 9
    def _run_test(self, requires_grad):
        embedder = ElmoTokenEmbedder(self.options_file, self.weight_file, requires_grad=requires_grad)
        batch_size = 3
        seq_len = 4
        char_ids = torch.from_numpy(numpy.random.randint(0, 262, (batch_size, seq_len, 50)))
        for _ in range(2):
            embeddings = embedder(char_ids)
            loss = embeddings.sum()
            loss.backward()

            elmo_grads = [param.grad for name, param in embedder.named_parameters() if '_elmo_lstm' in name]
            if requires_grad:
                # None of the elmo grads should be None.
                assert all([grad is not None for grad in elmo_grads])
            else:
                # All of the elmo grads should be None.
                assert all([grad is None for grad in elmo_grads])
Example 10
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 vocab_to_cache: List[str],
                 do_layer_norm: bool = False,
                 dropout: float = 0.5,
                 requires_grad: bool = False,
                 projection_dim: int = None) -> None:
        super(ElmoWordEmbedding, self).__init__()

        self._elmo = ElmoTokenEmbedder(options_file=options_file,
                                       weight_file=weight_file,
                                       do_layer_norm=do_layer_norm,
                                       dropout=dropout,
                                       requires_grad=requires_grad,
                                       projection_dim=projection_dim,
                                       vocab_to_cache=vocab_to_cache)

        self._projection = self._elmo._projection
Example 11
def load_elmo_model():
    elmo_embedders = ElmoTokenEmbedder(OPTION_FILE, WEIGHT_FILE)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedders})

    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      HIDDEN_DIM,
                      bidirectional=True,
                      batch_first=True))

    vocabulary = Vocabulary()

    model = BaseModel(word_embeddings=word_embeddings,
                      encoder=encoder,
                      vocabulary=vocabulary)

    output_elmo_model_file = os.path.join(PRETRAINED_ELMO,
                                          "lstm_elmo_model.bin")
    model.load_state_dict(torch.load(output_elmo_model_file))
    return model
Example 12
def sequence_labelling():

    # Index each token as a sequence of character Ids (ELMo)
    token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

    # Read the data
    reader = SequenceLabellingDatasetReader(token_indexers)
    training_data = reader.read(path='data/sequence_labelling/train.txt')
    validation_data = reader.read(path='data/sequence_labelling/test.txt')
    test_data = reader.read(path='data/sequence_labelling/test.txt')

    # Create a vocabulary
    vocabulary = Vocabulary.from_instances(training_data + validation_data +
                                           test_data)

    # Use ELMo embeddings
    elmo = ElmoTokenEmbedder(options_file=ELMO_OPTIONS_FILE,
                             weight_file=ELMO_WEIGHTS_FILE)

    embedder = BasicTextFieldEmbedder(token_embedders={"tokens": elmo})

    # The sequence labeller uses a bidirectional LSTM encoder

    lstm_layer = LSTM(input_size=ELMO_EMBEDDING_DIM,
                      hidden_size=HIDDEN_SIZE,
                      bidirectional=True,
                      batch_first=True)
    lstm_encoder = PytorchSeq2SeqWrapper(module=lstm_layer)

    model = SequenceLabeller(vocabulary=vocabulary,
                             embedder=embedder,
                             encoder=lstm_encoder)

    print("\nModel :\n")
    print(model)

    # Training
    train_model(model, training_data, validation_data, vocabulary)

    # Evaluation
    evaluate_sequence_labelling_model(model, test_data)
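
The module-level constants referenced above are not part of the snippet; plausible values (the URLs are copied from the other examples on this page, HIDDEN_SIZE is illustrative, and 1024 is the output dimension of the original ELMo model):

# Hypothetical constants for sequence_labelling.
ELMO_OPTIONS_FILE = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
ELMO_WEIGHTS_FILE = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
ELMO_EMBEDDING_DIM = 1024
HIDDEN_SIZE = 256  # illustrative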
Example 13
def get_model(vocab, params):
    emb_d = params["embedding_dim"]
    hidden_d = params["hidden_dim"]

    use_elmo_embeddings = params['use_elmo']
    use_lstm = params['use_lstm']
    n_layers = params["num_layers"]

    bidirectional = params['bidirectional']

    if use_elmo_embeddings:
        token_embedder = ElmoTokenEmbedder(ELMO_OPTIONS_FILE,
                                           ELMO_WEIGHTS_FILE)
    else:
        token_embedder = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=emb_d)

    word_embedder = BasicTextFieldEmbedder({"tokens": token_embedder})
    emb_d = word_embedder.get_output_dim()

    if use_lstm:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(emb_d,
                          hidden_d,
                          num_layers=n_layers,
                          batch_first=True,
                          bidirectional=bidirectional))
    else:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(emb_d,
                         hidden_d,
                         num_layers=n_layers,
                         batch_first=True,
                         bidirectional=bidirectional))

    model = NerModel(word_embedder,
                     encoder,
                     vocab,
                     num_categories=(3 if params["dataset"] == "senti" else 4))
    return model
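
The params dictionary is defined elsewhere; a hypothetical value covering every key get_model reads:

# Hypothetical params for get_model; keys mirror the lookups in the function above.
params = {
    "embedding_dim": 100,   # ignored when use_elmo is True
    "hidden_dim": 200,
    "use_elmo": False,
    "use_lstm": True,
    "num_layers": 2,
    "bidirectional": True,
    "dataset": "senti",     # anything else selects 4 output categories
}
model = get_model(vocab, params)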
Example 14
    def _run_test_with_vocab_to_cache(self, requires_grad):
        vocab_to_cache = ['<pad>', 'hello', 'world']
        embedder = ElmoTokenEmbedder(self.options_file,
                                     self.weight_file,
                                     requires_grad=requires_grad,
                                     vocab_to_cache=vocab_to_cache)
        word_tensor = torch.LongTensor([[[1, 2]]])
        for _ in range(2):
            embeddings = embedder(word_tensor, word_tensor)
            loss = embeddings.sum()
            loss.backward()

            elmo_grads = [param.grad for name, param in embedder.named_parameters()
                          if '_elmo_lstm' in name and '_token_embedder' not in name]
            if requires_grad:
                # None of the elmo grads should be None.
                assert all([grad is not None for grad in elmo_grads])
            else:
                # All of the elmo grads should be None.
                assert all([grad is None for grad in elmo_grads])

            assert all([param.grad is None for name, param in embedder.named_parameters()
                        if '_token_embedder' in name])
Example 15
def get_predictor():
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 60  #128
    MAX_LEN = 70
    dropout = 0.25
    lstm_layers = 2
    #  pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    vocab = Vocabulary.from_files(data_dir +
                                  "vocabulary_allennlp_imdb_twoclass")
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    elmo_embedding_dim = 256
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim,
                      HIDDEN_DIM,
                      bidirectional=True,
                      num_layers=lstm_layers,
                      dropout=dropout,
                      batch_first=True))
    model = LstmTwoClassifier(word_embeddings, lstm, vocab)
    net = torch.load("model_allen_imdb_twoclass.th", map_location=str(device))
    model.load_state_dict(net)
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    readerSentence = SentenceDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    return SentimentPredictor(model, dataset_reader=readerSentence)
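
A hedged usage sketch: predict_json comes from AllenNLP's Predictor base class, but the expected JSON key depends on how the custom SentimentPredictor maps JSON to instances, so treat "sentence" as a placeholder:

# Sketch only; the JSON key is an assumption about SentimentPredictor.
predictor = get_predictor()
output = predictor.predict_json({"sentence": "A genuinely moving film."})
print(output)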
Example 16
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size_tokens = vocab.get_vocab_size("tokens")
    vocab_size_chars = vocab.get_vocab_size("token_characters")

    embedder = BasicTextFieldEmbedder({"tokens": Embedding(embedding_dim=embedding_dim, pretrained_file=f"{cur_dir}/glove/glove.6B.200d.txt", trainable=False, num_embeddings=vocab_size_tokens, vocab=vocab),\
                                        "elmo": ElmoTokenEmbedder(weight_file="https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5", options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", do_layer_norm=False, dropout=0.0),\
                                        "token_characters":TokenCharactersEncoder(embedding=Embedding(embedding_dim=16, num_embeddings=vocab_size_chars, vocab=vocab), \
                                                                                encoder=CnnEncoder(embedding_dim=16, num_filters=128, ngram_filter_sizes=[3]))})
    encoder = PytorchTransformer(input_dim=1352,
                                 num_layers=6,
                                 positional_encoding="sinusoidal")

    # embedder = BasicTextFieldEmbedder({"tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size)})
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    # embedder = BasicTextFieldEmbedder({"tokens": PretrainedTransformerMismatchedEmbedder("bert-large-uncased")})
    # encoder = LstmSeq2SeqEncoder(input_size=1024, hidden_size=1024, num_layers=2, dropout=0.5, bidirectional=True)

    if args.pseudo:
        return PseudoCrfTagger(vocab, embedder, encoder,
                               label_encoding="BIOUL",
                               include_start_end_transitions=False,
                               num_virtual_models=num_virtual_models)
    else:
        return CrfTagger(vocab, embedder, encoder,
                         label_encoding="BIOUL",
                         include_start_end_transitions=False)
Example 17
    def embeddings_returner(self, vocab=None):
        '''
        Build the token embedder selected by self.embedding_strategy, using either the name of
        a pretrained model (e.g. bert-base-uncased) or the path to the .tar.gz file with the
        model weights.
        :param vocab: needed only when pretrained (GloVe-style) embeddings are used.
        :return: (token embedder, its output dimension, BasicTextFieldEmbedder)
        '''
        # Notes on BERT casing:
        #   "bert-base-uncased", do_lower_case=True
        #   "bert-base-cased",   do_lower_case=False
        # https://github.com/huggingface/pytorch-transformers/issues/712
        # https://qiita.com/uedake722/items/b7f4b75b4d77d9bd358b
        if self.embedding_strategy == 'bert':
            self.bertmodel_dir = ''
            if self.ifbert_use_whichmodel == 'general':
                self.bertmodel_dir += 'bert-base-uncased/'  # recommended version is uncased in the original repository
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir

                # included in pytorch_transformers, so we replace it with model name itself
                self.bert_weight_filepath = copy.copy('bert-base-uncased')

            elif self.ifbert_use_whichmodel == 'scibert':
                self.bertmodel_dir += 'scibert_scivocab_uncased/'  # recommended version is uncased in the original repository
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
                self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'

            elif self.ifbert_use_whichmodel == 'biobert':
                self.bertmodel_dir += 'biobert_v1.1_pubmed/'  # currently only the cased version is supported
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
                self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'  # including bert_config.json and bin.

            # Load embedder
            bert_embedder = PretrainedBertEmbedder(
                pretrained_model=self.bert_weight_filepath,
                top_layer_only=self.bert_top_layer_only,
                requires_grad=self.emb_requires_grad)
            return bert_embedder, bert_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': bert_embedder},
                                      allow_unmatched_keys=True)

        elif self.embedding_strategy == 'elmo':
            if self.ifelmo_use_whichmodel == 'general':
                options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
                weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
            elif self.ifelmo_use_whichmodel == 'pubmed':
                options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_options.json'
                weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5'
            elif self.ifelmo_use_whichmodel == 'bioelmo':
                options_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_options.json'
                weight_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_weights.hdf5'
            else:
                raise ValueError(
                    f'Unknown ELMo model: {self.ifelmo_use_whichmodel}')
            elmo_embedder = ElmoTokenEmbedder(
                options_file=options_file,
                weight_file=weight_file,
                requires_grad=self.emb_requires_grad)
            return elmo_embedder, elmo_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': elmo_embedder})

        elif self.embedding_strategy == 'pretrained':

            print('\nGloVe pretrained vocab loading\n')

            if 'glove' in self.args.ifpretrained_use_whichmodel:
                embedding_dim = 300
            else:
                embedding_dim = 200

            pretrain_emb_embedder = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    'pretrained_file': self.glove_embeddings_file,
                    'embedding_dim': embedding_dim,
                    'trainable': False,
                    'padding_index': 0
                }))

            return pretrain_emb_embedder, pretrain_emb_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': pretrain_emb_embedder})
Example 18
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")

    EMBEDDING_DIM = 300
    HIDDEN_DIM = 300
    NUM_FILTERS = 60
    NGRAM_FILTER_SIZES = (2, 3, 4, 5, 6)
    #out_dim for char = len(NGRAM_FILTER_SIZES) * NUM_FILTERS
    F_OUT = 200

    elmo_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    elmo_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file)

    character_embedding = Embedding(vocab=vocab,
                                    embedding_dim=EMBEDDING_DIM,
                                    vocab_namespace='character_vocab')
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             ngram_filter_sizes=NGRAM_FILTER_SIZES)
    token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

    pos_tag_embedding = Embedding(vocab=vocab,
                                  embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='pos_tag_vocab')

    ner_tag_embedding = Embedding(vocab=vocab,
                                  embedding_dim=EMBEDDING_DIM,
                                  vocab_namespace='ner_tag_vocab')

    word_embedding = Embedding(vocab=vocab,
                               embedding_dim=EMBEDDING_DIM,
                               vocab_namespace='token_vocab')

    utterance_embedder = BasicTextFieldEmbedder(
        token_embedders={
            'elmo_tokens': elmo_embedding,
            'token_characters': token_encoder,
            'pos_tags': pos_tag_embedding,
            'ner_tags': ner_tag_embedding
        })

    #slot embed
    slot_embedder = BasicTextFieldEmbedder(token_embedders={
        'elmo_tokens': elmo_embedding,
        'token_characters': token_encoder,
    })

    utterance_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 +
                      len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))
    slot_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(1024 + len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))

    similarity = LinearMatrixAttention(tensor_1_dim=2 * HIDDEN_DIM,
                                       tensor_2_dim=2 * HIDDEN_DIM,
                                       combination="x,y,x*y",
                                       activation=Activation.by_name('tanh')())

    modeling_lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(
            2 * 5 * HIDDEN_DIM,  # bi-direction
            HIDDEN_DIM,
            num_layers=2,
            batch_first=True,
            bidirectional=True))

    #step1_utterance
    utterance_embedder2 = BasicTextFieldEmbedder(
        token_embedders={
            'elmo_tokens': elmo_embedding,
            'token_characters': token_encoder,
            'pos_tags': pos_tag_embedding,
            'ner_tags': ner_tag_embedding
        })
    utterance_lstm2 = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(2 * EMBEDDING_DIM + 1024 +
                      len(NGRAM_FILTER_SIZES) * NUM_FILTERS,
                      HIDDEN_DIM,
                      num_layers=2,
                      batch_first=True,
                      bidirectional=True))

    # Feed-forward layer that combines the two LSTM outputs
    final_linear_layer = FeedForward(2 * HIDDEN_DIM, 2, [HIDDEN_DIM, F_OUT],
                                     torch.nn.ReLU(), 0.3)
    #CRF model
    model = CrfTagger(vocab=vocab,
                      utterance_embedder=utterance_embedder,
                      utterance_embedder2=utterance_embedder2,
                      slot_embedder=slot_embedder,
                      utterance_encoder=utterance_lstm,
                      utterance_encoder2=utterance_lstm2,
                      slot_encoder=slot_lstm,
                      matrix_attention=similarity,
                      modeling_layer=modeling_lstm,
                      fc_ff_layer=final_linear_layer)
    return model
Example 19
def build_embeddings(args, vocab, pretrained_embs=None):
    ''' Build embeddings according to options in args '''
    d_emb, d_char = 0, args.d_char

    token_embedder = {}
    # Word embeddings
    if args.word_embs != 'none':
        if args.word_embs in ['glove', 'fastText'] and pretrained_embs is not None:
            log.info("\tUsing word embeddings from %s", args.word_embs_file)
            word_embs = pretrained_embs
            d_word = pretrained_embs.size()[-1]
        else:
            log.info("\tLearning word embeddings from scratch!")
            word_embs = None
            d_word = args.d_word

        embeddings = Embedding(vocab.get_vocab_size('tokens'), d_word,
                               weight=word_embs, trainable=False,
                               padding_index=vocab.get_token_index('@@PADDING@@'))
        token_embedder["words"] = embeddings
        d_emb += d_word
    else:
        log.info("\tNot using word embeddings!")

    # Handle cove
    if args.cove:
        sys.path.append(args.path_to_cove)
        try:
            from cove import MTLSTM as cove_lstm
            cove_emb = cove_lstm(n_vocab=vocab.get_vocab_size('tokens'),
                                 vectors=embeddings.weight.data)
            d_emb += 600
            log.info("\tUsing CoVe embeddings!")
        except ImportError:
            log.info("Failed to import CoVE!")
    else:
        cove_emb = None

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size('chars'), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(',')])
        char_encoder = CnnEncoder(d_char, num_filters=args.n_char_filters,
                                  ngram_filter_sizes=filter_sizes,
                                  output_dim=d_char)
        char_embedder = TokenCharactersEncoder(char_embeddings, char_encoder,
                                               dropout=args.dropout_embs)
        d_emb += d_char
        token_embedder["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # Handle elmo
    if args.elmo:
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
        if args.elmo_chars_only:
            log.info("\tUsing ELMo character CNN only!")
            #elmo_embedder = elmo_embedder._elmo._elmo_lstm._token_embedder
            elmo_embedder = ElmoCharacterEncoder(options_file=ELMO_OPT_PATH,
                                                 weight_file=ELMO_WEIGHTS_PATH,
                                                 requires_grad=False)
            d_emb += 512
        else:
            log.info("\tUsing full ELMo!")
            elmo_embedder = ElmoTokenEmbedder(options_file=ELMO_OPT_PATH,
                                              weight_file=ELMO_WEIGHTS_PATH,
                                              dropout=args.dropout)
            d_emb += 1024

        token_embedder["elmo"] = elmo_embedder

    embedder = BasicTextFieldEmbedder(token_embedder)
    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_emb
Example 20
            Params({
                "pretrained_file": config["pre_trained_embedding"],
                "embedding_dim": config["pre_trained_embedding_dim"],
                "trainable": config["train_embedding"],
                "padding_index": 0,
                "sparse": config["sparse_gradient_embedding"]
            }))
    elif config["token_embedder_type"] == "fasttext":
        vocab = None  #FastTextVocab(config["fasttext_vocab_mapping"])
        tokens_embedder = FastTextEmbeddingBag(numpy.load(
            config["fasttext_weights"]),
                                               sparse=True)

    elif config["token_embedder_type"] == "elmo":
        vocab = None
        tokens_embedder = ElmoTokenEmbedder(config["elmo_options_file"],
                                            config["elmo_weights_file"])
    else:
        logger.error("token_embedder_type %s not known",
                     config["token_embedder_type"])
        exit(1)

    word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

    if config["model"] == "knrm":
        model = KNRM(word_embedder,
                     n_kernels=config["knrm_kernels"]).cuda(cuda_device)

    elif config["model"] == "conv_knrm":
        model = Conv_KNRM(
            word_embedder,
            n_grams=config["conv_knrm_ngrams"],
Example 21
    batch_size=config.eval_batch_size,
    sorting_keys=[("text", "num_tokens")],
)
val_iterator.index_with(vocab)

if args.embedding_type == 'glove':
    param_dict = {
        "pretrained_file":
        "(https://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.300d.txt",
        "embedding_dim": 300
    }
    params = Params(params=param_dict)
    token_embedding = Embedding.from_params(vocab=vocab, params=params)
elif args.embedding_type == 'elmo':
    token_embedding = ElmoTokenEmbedder(args.options_file,
                                        args.weights_file,
                                        requires_grad=args.finetune_embeddings)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

if args.encoder_type == 'bag':
    encoder = BagOfEmbeddingsEncoder(word_embeddings.get_output_dim())
elif args.encoder_type == 'lstm':
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      config.hidden_sz,
                      bidirectional=True,
                      batch_first=True))

num_classes = vocab.get_vocab_size("labels")
decoder_input_dim = encoder.get_output_dim()
Example 22
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': elmo_token_indexer},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname) for fname in
        ["train_all_seq.txt", "test_all_seq.txt", "val_all_seq.txt"])

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset,
                                      min_count={
                                          'tokens': 1,
                                          'target_tokens': 1
                                      })

    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                              embedding_dim=256)
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=elmo_embedding_dim)
    #elmo_embedder = Elmo(options_file, weight_file, 2, dropout=0.5)
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({'tokens': elmo_embedder})
    # en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    # embedding_dim=256)
    source_embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    #Initializing the model
    max_decoding_steps = 20
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim, batch_first=True))

    # encoder = StackedSelfAttentionEncoder(input_dim=elmo_embedding_dim, hidden_dim=hidden_dim, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)
    attention = DotProductAttention()

    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps,
                          target_embedding_dim=elmo_embedding_dim,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=1,
                      cuda_device=0 if USE_GPU else -1)

    for i in range(20):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(dev_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:',
                  predictor.predict_instance(instance)['predicted_tokens'])

    #Saving the model
    with open("model_seq2seq.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("vocabulary_seq2seq")
    predictor = SimpleSeq2SeqPredictor(model, reader)
    with open('predict_seq2seq.txt', 'w+') as f:
        for instance in itertools.islice(test_dataset, 10):
            preds = predictor.predict_instance(instance)['predicted_tokens']
            f.write(" ".join(preds) + "\n")