Example #1
File: toy.py Project: nilesh-c/kgqa
    def setUp(self):
        self.reader = ToyReader()
        self.train_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
        self.dev_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
        self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                    embedding_dim=256, padding_index=0)

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                                num_layers=2,
                                                hidden_size=256,
                                                bidirectional=True,
                                                dropout=0.4,
                                                batch_first=True))

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=256,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True
                                   )

        self.model.cuda(0)
Example #2
def get_elmo_embedder():
    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    return word_embeddings
Example #3
    def __init__(
            self,
            vocab,
            num_embeddings=None,  # Backwards compatibility.
            embedding_dim=50,
            rnn_dim=650,
            stack_dim=16,
            rnn_cell_type=torch.nn.LSTMCell,
            push_rnn_state=False,
            swap_push_pop=True,  # Backward compatibility.
            push_ones=True):

        super().__init__(vocab)
        self._vocab_size = vocab.get_vocab_size()
        if num_embeddings is None: num_embeddings = self._vocab_size
        embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
        self._embedder = BasicTextFieldEmbedder({"tokens": embedding})

        self._rnn_dim = rnn_dim
        self._stack_dim = stack_dim
        self._push_rnn_state = push_rnn_state

        if rnn_cell_type == "gru":
            rnn_cell_type = torch.nn.GRUCell

        self._rnn_cell = rnn_cell_type(embedding_dim + stack_dim, rnn_dim)
        self._control_layer = ControlLayer(rnn_dim, stack_dim, vision=4)
        self._classifier = torch.nn.Linear(rnn_dim, 1)

        self._accuracy = BooleanAccuracy()
        self._pop_strength = Average()
        self._criterion = torch.nn.BCEWithLogitsLoss()

        self._push_ones = push_ones
        self._swap_push_pop = swap_push_pop
Example #4
def trainModel(train_dataset, validation_dataset, vocab):
    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=False, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    # optimizer = optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100,
                      cuda_device=cuda_device)
    trainer.train()
    return model
Example #5
    def __init__(
        self,
        vocab: Vocabulary,
        token_embedder: TokenEmbedder,
        num_labels: int,
        pool: str = 'sum',
    ) -> None:
        super().__init__(vocab)

        self.word_embedders = BasicTextFieldEmbedder(
            {"tokens": token_embedder})
        dim = token_embedder.get_output_dim()
        self.rotation = FeedForward(dim, 1, dim, torch.nn.ReLU())

        self.encoder = BagOfEmbeddingsEncoder(embedding_dim=dim,
                                              pool=pool,
                                              dropout=0.1)

        self.feedforward = FeedForward(dim * 2, 2, dim * 2, torch.nn.ReLU(),
                                       0.1)
        self.output_logit = FeedForward(dim * 2, 1, num_labels, PassThrough())

        self._num_labels = num_labels

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
Example #6
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        weight = torch.ones(vocab.get_vocab_size(), 10)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size(), embedding_dim=10, weight=weight, trainable=False
        )
        self.embedder = BasicTextFieldEmbedder({"words": token_embedding})
Example #7
    def __init__(self, vocab: Vocabulary, token_embedder: TokenEmbedder,
                 num_labels: int) -> None:
        super().__init__(vocab)

        self.word_embedders = BasicTextFieldEmbedder(
            {"tokens": token_embedder})
        self._encoder = LstmSeq2SeqEncoder(300, 300, 2)
        # self._encoder = PytorchTransformer(300, 3, 300, 4)

        self._matrix_attention = DotProductMatrixAttention()
        self._projection_feedforward = FeedForward(300 * 4, 1, 300,
                                                   torch.nn.ReLU(), 0.2)

        self._inference_encoder = LstmSeq2SeqEncoder(300, 300, 2)
        # self._inference_encoder = PytorchTransformer(300, 3, 300, 4)

        self.dropout = torch.nn.Dropout(0.3)
        self.rnn_input_dropout = InputVariationalDropout(0.3)

        self._output_feedforward = FeedForward(1200, 1, 300, torch.nn.ReLU(),
                                               0.2)
        self._output_logit = FeedForward(300, 1, num_labels, lambda x: x)

        self._num_labels = num_labels

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
Example #8
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
Example #9
    def __init__(self, num_authors: int, out_sz: int, vocab: Vocabulary):
        super().__init__(vocab)

        # init word embedding
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        self.word_embeddings = BasicTextFieldEmbedder(
            {"tokens": bert_embedder},
            # we'll be ignoring masks so we'll need to set this to True
            allow_unmatched_keys=True)

        self.encoder = BertSentencePooler(
            vocab, self.word_embeddings.get_output_dim())

        self.num_authors = num_authors

        # skills dim
        self.num_sk, self.sk_dim, self.time_dim = 20, 768, 32
        self.author_embeddings = nn.Parameter(torch.randn(
            num_authors, self.num_sk, self.sk_dim),
                                              requires_grad=True)  # (m, k, d)

        self.multihead_att = TempCtxAttention(h=8, d_model=self.sk_dim)

        self.attention = nn.Parameter(torch.randn(
            self.word_embeddings.get_output_dim(), self.sk_dim),
                                      requires_grad=True)
        # nn.Linear(self.word_embeddings.get_output_dim(), self.sk_dim)

        self.cohere_loss = CoherenceLoss(self.encoder.get_output_dim(), out_sz)
Example #10
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)

    trainer.train()
Example #11
def build_model(vocab: Vocabulary, wbrun: Any) -> Model:
    """
    Build the Model object, along with the embedder and encoder.

    :param vocab: The pre-instantiated vocabulary object.
    :param wbrun: Run object passed through to BertLinearClassifier.
    :return: The Model object itself.
    """
    log.debug("Building the model.")
    # vocab_size = vocab.get_vocab_size("tokens")

    # TokenEmbedder object.
    bert_embedder = PretrainedTransformerEmbedder("bert-base-uncased")

    # TextFieldEmbedder that wraps TokenEmbedder objects. Each
    # TokenEmbedder embeds the output of one TokenIndexer -- the data produced
    # by a TextField is a dict {name: representation}, hence the
    # TokenEmbedders are keyed by the corresponding names.
    embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder}
    )

    log.debug("Embedder built.")
    encoder = BertPooler("bert-base-uncased", requires_grad=True)
    # encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(768,20,batch_first=True))
    log.debug("Encoder built.")

    return BertLinearClassifier(vocab, embedder, encoder, wbrun).cuda(0)
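The comments above make the general point behind all of these snippets: the key under which a TokenIndexer indexes text in the DatasetReader must be the same key used when wrapping the matching TokenEmbedder in BasicTextFieldEmbedder. A minimal sketch of that pairing, assuming AllenNLP 1.x and the same "bert-base-uncased" model (the reader-side setup here is an illustration, not part of the original project):

from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

transformer = "bert-base-uncased"

# In the DatasetReader, index text under the key "tokens" ...
tokenizer = PretrainedTransformerTokenizer(transformer)
token_indexers = {"tokens": PretrainedTransformerIndexer(transformer)}

# ... and use the same key when wrapping the TokenEmbedder, so the
# TextFieldEmbedder can route each indexed representation to its embedder.
embedder = BasicTextFieldEmbedder({"tokens": PretrainedTransformerEmbedder(transformer)})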
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 name_embedder: TextFieldEmbedder,
                 definition_embedder: TextFieldEmbedder,
                 name_encoder: Seq2VecEncoder,
                 definition_encoder: Seq2VecEncoder,
                 siamese_feedforward: FeedForward,
                 decision_feedforward: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(OntoEmmaNN, self).__init__(vocab, regularizer)

        self.name_embedder = name_embedder
        self.distributed_name_embedder = BasicTextFieldEmbedder({
            k: TimeDistributed(v)
            for k, v in name_embedder._token_embedders.items()
        })
        self.definition_embedder = definition_embedder
        self.name_encoder = name_encoder
        self.definition_encoder = definition_encoder
        self.siamese_feedforward = siamese_feedforward
        self.decision_feedforward = decision_feedforward
        self.sigmoid = torch.nn.Sigmoid()
        self.accuracy = BooleanF1()
        self.loss = torch.nn.BCELoss()

        initializer(self)
Example #13
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200
    # turn the tokens into 200 dim embedding. Then, turn the embeddings into encodings
    pretrained_file = ''  # add pretrained embedding file path here
    embedder = BasicTextFieldEmbedder({
        "tokens":
        Embedding(embedding_dim=EMBED_DIMS,
                  num_embeddings=vocab_size,
                  pretrained_file=pretrained_file,
                  trainable=False)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5), num_filters=5
    )  # num_filters is a tad bit dangerous: the reason is that we have this many filters for EACH ngram filter size
    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [  #("embedder", l2_reg),
            ("encoder", l2_reg), ("classifier", l2_reg)
        ]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder,
                               regularizer_applicator)
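As the comments above note, CnnEncoder allocates num_filters filters for each entry in ngram_filter_sizes, so when no explicit output_dim is given the encoder's output dimension is num_filters * len(ngram_filter_sizes). A quick standalone check with the same values (not part of the original project):

from allennlp.modules.seq2vec_encoders import CnnEncoder

encoder = CnnEncoder(embedding_dim=200, ngram_filter_sizes=(2, 3, 5), num_filters=5)
print(encoder.get_output_dim())  # 5 filters * 3 ngram sizes = 15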
Example #14
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size_s = vocab.get_vocab_size("source_tokens")
    vocab_size_t = vocab.get_vocab_size("target_tokens")

    bleu = BLEU(exclude_indices={0, 2, 3})

    source_text_embedder = BasicTextFieldEmbedder(
        {"source_tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_s)})
    encoder = PytorchTransformer(input_dim=embedding_dim,
                                 num_layers=num_layers,
                                 positional_encoding="sinusoidal",
                                 feedforward_hidden_dim=dff,
                                 num_attention_heads=num_head,
                                 positional_embedding_size=embedding_dim,
                                 dropout_prob=dropout)

    # target_text_embedder = BasicTextFieldEmbedder({"target_tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t)})
    target_text_embedder = Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t)
    decoder_net = StackedSelfAttentionDecoderNet(decoding_dim=embedding_dim,
                                                 target_embedding_dim=embedding_dim,
                                                 feedforward_hidden_dim=dff,
                                                 num_layers=num_layers,
                                                 num_attention_heads=num_head,
                                                 dropout_prob=dropout)
    decoder_net.decodes_parallel = True

    if args.pseudo:
        decoder = PseudoAutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder,
                                                 target_namespace="target_tokens",
                                                 tensor_based_metric=bleu,
                                                 scheduled_sampling_ratio=0.0,
                                                 decoder_lin_emb=args.dec)
        return PseudoComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder,
                                     num_virtual_models=num_virtual_models)
    else:
        decoder = AutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder,
                                           target_namespace="target_tokens",
                                           tensor_based_metric=bleu,
                                           scheduled_sampling_ratio=0.0)
        return ComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder)
Example #15
def load_embedding(args, vocab):
    # Randomly initialize vectors
    if args.embedding_type == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim)

    # Load word2vec vectors
    elif args.embedding_type == "w2v":
        embedding_path = args.embedding_path
        save_weight_file = './{}_embedding_weight.pt'.format(args.dataset)
        if os.path.exists(save_weight_file):
            weight = torch.load(save_weight_file)
        else:
            weight = _read_pretrained_embeddings_file(
                embedding_path,
                embedding_dim=args.embedding_dim,
                vocab=vocab,
                namespace="tokens")
            torch.save(weight, save_weight_file)

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim,
            weight=weight,
            trainable=True)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    return word_embeddings
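A hypothetical invocation of load_embedding; the attribute names on args mirror the accesses in the function above, the paths are placeholders, and an AllenNLP Vocabulary named vocab is assumed to already be in scope:

from argparse import Namespace

args = Namespace(embedding_type="w2v",
                 embedding_path="embeddings/word2vec.300d.txt",  # placeholder path
                 embedding_dim=300,
                 dataset="sst")  # only used to name the cached weight file
word_embeddings = load_embedding(args, vocab)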
Example #16
    def __init__(self, args, num_authors: int, out_sz: int,
                 vocab: Vocabulary):
        super().__init__(vocab)

        # init word embedding
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        self.word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                      # we'll be ignoring masks so we'll need to set this to True
                                                      allow_unmatched_keys=True)

        self.encoder = BertSentencePooler(vocab, self.word_embeddings.get_output_dim())

        self.num_authors = num_authors

        # skills dim
        self.num_sk, self.sk_dim = 20, 768
        self.author_embeddings = nn.Parameter(torch.randn(num_authors, self.sk_dim), requires_grad=True)  # (m, d)

        self.attention = nn.Parameter(torch.randn(self.word_embeddings.get_output_dim(), self.sk_dim), requires_grad=True)
        # nn.Linear(self.word_embeddings.get_output_dim(), self.sk_dim)

        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=2)
        self.sigmoid = nn.Sigmoid()

        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        # self.loss = nn.CrossEntropyLoss()

        # loss related
        # self.cohere_loss = CoherenceLoss(self.encoder.get_output_dim(), out_sz)
        self.triplet_loss = TripletLoss(self.encoder.get_output_dim(), out_sz)
Example #17
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model: str = "roberta-large",
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        **kwargs
    ) -> None:
        super().__init__(vocab, **kwargs)

        self._text_field_embedder = PretrainedTransformerEmbedder(
            transformer_model,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
        )
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": self._text_field_embedder})
        self._pooler = BertPooler(
            transformer_model,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            dropout=0.1,
        )

        self._linear_layer = torch.nn.Linear(
            self._text_field_embedder.get_output_dim(), 1)
        self._linear_layer.weight.data.normal_(mean=0.0, std=0.02)
        self._linear_layer.bias.data.zero_()

        self._loss = torch.nn.CrossEntropyLoss()
        self._accuracy = CategoricalAccuracy()
Example #18
def running_NER():
    reader = PosDatasetReader()
    train_dataset = reader.read('../data/700_multi_data/600_ner_train.txt')
    validation_dataset = reader.read('../data/700_multi_data/66_ner_test.txt')

    vocab = Vocabulary.from_files("../model_store/vocabulary")

    # '''vocab part'''
    # train_1 = reader.read('../data/train/train.json')
    # train_2 = reader.read('../data/train/dev.json')

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
    trainer.train()
Example #19
def generate_res_file():
    reader = PosDatasetReader()
    vocab = Vocabulary.from_files("../model_store/vocabulary")

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model2 = LstmTagger(word_embeddings, lstm, vocab)

    with open("../model_store/model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))
    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)

    train_read_file = open('../data/only_sentence/raw_test.json', 'r')
    train_write_file = open('../data/only_sentence/ner_test.json', 'w')
    for line in train_read_file:
        tag_logits2 = predictor2.predict(
            line.replace('.', '').replace(',', '').replace('\n',
                                                           ''))['tag_logits']
        tag_ids = np.argmax(tag_logits2, axis=-1)
        res = [model2.vocab.get_token_from_index(i, 'labels') for i in tag_ids]
        for i in range(len(res)):
            train_write_file.write(res[i] + ' ')
        # train_write_file.write(str(tag_logits2))
        train_write_file.write('\n')
        train_write_file.flush()
    train_read_file.close()
    train_write_file.close()
    print('finish')


# generate_res_file()
Example #20
def create_model(
        vocab: Vocabulary,
        embedding_dim: int,
        max_filter_size: int,
        num_filters: int,
        output_dim: int,
        dropout: float,
):
    model = BasicClassifier(
        text_field_embedder=BasicTextFieldEmbedder(
            {
                "tokens": Embedding(
                    embedding_dim=embedding_dim,
                    trainable=True,
                    vocab=vocab
                )
            }
        ),
        seq2vec_encoder=CnnEncoder(
            ngram_filter_sizes=range(2, max_filter_size),
            num_filters=num_filters,
            embedding_dim=embedding_dim,
            output_dim=output_dim,
        ),
        dropout=dropout,
        vocab=vocab,
    )
    return model
Example #21
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    constructor_to_call=None,
                    constructor_to_inspect=None) -> 'BertModel':
        # initialize the class using JSON params
        embedder_params = params.pop("text_field_embedder")
        token_params = embedder_params.pop("tokens")
        embedding = PretrainedTransformerEmbedder.from_params(
            vocab=vocab, params=token_params)
        text_field_embedder = BasicTextFieldEmbedder(
            token_embedders={'tokens': embedding})
        #         text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        seq2vec_encoder_params = params.pop("seq2vec_encoder")
        seq2vec_encoder = Seq2VecEncoder.from_params(seq2vec_encoder_params)

        initializer = InitializerApplicator()  # .from_params(params.pop("initializer", []))

        params.assert_empty(cls.__name__)
        #         print(cls)
        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   seq2vec_encoder=seq2vec_encoder,
                   initializer=initializer)
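For reference, a params layout that this from_params would accept; it is reconstructed from the pops above and is an assumption (the inner keys are whatever the chosen PretrainedTransformerEmbedder and registered Seq2VecEncoder expect), with a vocab object assumed to be already built:

from allennlp.common import Params

params = Params({
    "text_field_embedder": {
        "tokens": {"model_name": "bert-base-uncased"}  # passed to PretrainedTransformerEmbedder.from_params
    },
    "seq2vec_encoder": {
        "type": "bert_pooler",                         # any registered Seq2VecEncoder works here
        "pretrained_model": "bert-base-uncased",
    },
})
model = BertModel.from_params(vocab=vocab, params=params)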
Example #22
def build_model(vocab: Vocabulary) -> Model:
    vocab_size = vocab.get_vocab_size(
        "tokens")  # "tokens" from data_reader.token_indexers ??
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)
Example #23
def load_elmo_embeddings(large=True):
    """
    Loads pre-trained ELMo embeddings ('large' model by default).
    
    Parameters
    ----------
    large: bool
        Set to True to load the Large ELMo model; False for small ELMo model
    
    Returns
    -------
    TextFieldEmbedder
    """
    if large:  # use the Large pre-trained model
        print("Loading LARGE ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    else:  # use the Small pre-trained model
        print("Loading SMALL ELMo..")
        options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    print("Pre-trained ELMo loaded..")
    return word_embeddings
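A short usage sketch for the helper above; the printed dimensionality should be 256 for the small pre-trained ELMo model and 1024 for the large one:

word_embeddings = load_elmo_embeddings(large=False)
print(word_embeddings.get_output_dim())  # 256 (small model); the large model yields 1024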
Example #24
def build_model(
        vocab: Vocabulary,
        embedding_dim: int,
        pretrained_file: str = None,
        initializer: InitializerApplicator = None,
        regularizer: RegularizerApplicator = None
        ) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    word_vec = Embedding(embedding_dim=embedding_dim,
                          num_embeddings=vocab_size,
                          pretrained_file=pretrained_file,
                          vocab=vocab)
    embedding = BasicTextFieldEmbedder({"tokens": word_vec})

    # Use ELMo
    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Use BERT
    # bert_embedder = PretrainedTransformerEmbedder(
    #     model_name='bert-base-uncased',
    #     max_length=512,
    #     train_parameters=False
    # )
    # embedding = BasicTextFieldEmbedder({"tokens": bert_embedder})

    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    return SimpleClassifier(vocab, embedding, encoder, initializer, regularizer=regularizer)
Example #25
def running_whole_model():
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("sentence", "num_tokens"),
                                                                   ("structures1", "num_tokens"),
                                                                   ("structures2", "num_tokens"),
                                                                   ("structures3", "num_tokens")])
    iterator.index_with(vocab)


    model = All_generating(embed_size=EMBEDDING_DIM,
                           word_embeddings=word_embeddings,
                           vocab=vocab,
                           num_of_candidates=7,
                           )

    # optimizer = adabound.AdaBound(model.parameters(), lr=lr, final_lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=lr)


    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=whole_train_dataset,
                      validation_dataset=whole_validation_dataset,
                      patience=5,
                      num_epochs=30)
    trainer.train()
Example #26
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)
Example #27
def build_adversarial_transformer_model(vocab: Vocabulary, transformer_model: str) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedding = PretrainedTransformerEmbedder(model_name=transformer_model)
    embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})
    encoder = BertPooler(transformer_model)
    return SimpleClassifier(vocab, embedder, encoder)
Example #28
    def test_context_sequence_encoding(self):
        elmo_credbank_model_path = load_abs_path(
            os.path.join(
                os.path.dirname(__file__), '..', "resource", "embedding",
                "elmo_model",
                "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
            ))

        elmo_embedder = ElmoTokenEmbedder(
            options_file=
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            weight_file=elmo_credbank_model_path,
            do_layer_norm=False,
            dropout=0.5)
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        EXPECTED_CONTEXT_INPUT_SIZE = 60

        rumor_classifier = RumorTweetsClassifer(
            word_embeddings,
            None,
            None,
            None,
            classifier_feedforward=None,
            cxt_content_encoder=None,
            cxt_metadata_encoder=None,
            social_context_self_attention_encoder=None,
            cuda_device=-1)

        tweet_id = "500327120770301952"
        single_source_tweet_tensor_1 = self.tweet_context_encoding_by_tweet_id(
            rumor_classifier, tweet_id)
        print(type(single_source_tweet_tensor_1))
        print(single_source_tweet_tensor_1.shape)
        assert type(single_source_tweet_tensor_1) == torch.Tensor
        assert single_source_tweet_tensor_1.shape == (
            97, EXPECTED_CONTEXT_INPUT_SIZE
        ), "expected shape is [19, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

        tweet_id = "552806117328568321"  # with three replies
        single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
            rumor_classifier, tweet_id)
        print(type(single_source_tweet_tensor_2))
        print(single_source_tweet_tensor_2.shape)
        assert type(single_source_tweet_tensor_2) == torch.Tensor
        assert single_source_tweet_tensor_2.shape == (
            94, EXPECTED_CONTEXT_INPUT_SIZE
        ), "expected shape is [3, %s]" % EXPECTED_CONTEXT_INPUT_SIZE

        tweet_id = "552806117328568321"  # with three replies
        print("social context encoding without numerical feature .")
        single_source_tweet_tensor_2 = self.tweet_context_encoding_by_tweet_id(
            rumor_classifier, tweet_id, disable_nf=True)
        print(type(single_source_tweet_tensor_2))
        print(single_source_tweet_tensor_2.shape)
        assert type(single_source_tweet_tensor_2) == torch.Tensor
        assert single_source_tweet_tensor_2.shape == (
            94, EXPECTED_CONTEXT_INPUT_SIZE
        ), "expected shape is [3, %s]" % EXPECTED_CONTEXT_INPUT_SIZE
Example #29
def predict(vocab2):
	bert_token_indexer = PretrainedBertIndexer(
	    pretrained_model="bert-large-uncased",
	    max_pieces=config.max_seq_len,
	    do_lowercase=True,
	)
	reader = BertAnalogyDatasetReader(
		tokenizer=bert_tokenizer, 
		token_indexers={'tokens':bert_token_indexer}
	)	

	train_dataset, test_dataset, dev_dataset = (reader.read(DATA_ROOT + "/" + fname) for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

	bert_embedder = PretrainedBertEmbedder(
	         pretrained_model='bert-large-uncased',
	         top_layer_only=True, # conserve memory
	)
	word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": bert_embedder},
	                                                             # we'll be ignoring masks so we'll need to set this to True
	                                                            allow_unmatched_keys = True)

	BERT_DIM = word_embeddings.get_output_dim()
	class BertSentencePooler(Seq2VecEncoder):
	    def forward(self, embs: torch.tensor, 
	                mask: torch.tensor=None) -> torch.tensor:
	        # extract first token tensor
	        return embs[:, 0]
	    
	    @overrides
	    def get_output_dim(self) -> int:
	        return BERT_DIM
	        
	# if not vocab2: 
	# 	vocab2 = Vocabulary.from_files("./bert_vocabulary")

	bert_encoder = BertSentencePooler(vocab2)
	model2 = LstmModel(word_embeddings, bert_encoder, vocab2)
	if USE_GPU: model2.cuda()

	with open("./bert_model.th", 'rb') as f:
		model2.load_state_dict(torch.load(f))
	
	predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)
	with open('bert_predictions.txt', 'w+') as f:
		top_10_words_list = []
		for analogy_test in test_dataset:
			logits = predictor2.predict_instance(analogy_test)['logits']
			label_id = np.argmax(logits)
			label_predict = model2.vocab.get_token_from_index(label_id, 'labels')

			top_10_ids = np.argsort(logits)[-10:]
			top_10_words = [model2.vocab.get_token_from_index(id, 'labels') for id in top_10_ids]
			top_10_words_list.append(top_10_words)
			f.write(label_predict + "\n")

	top_10_words_list = np.array(top_10_words_list)
	print(top_10_words_list.shape)
	np.save('bert_top_10_words_list.npy', np.array(top_10_words_list))
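The inline BertSentencePooler defined inside predict() simply returns the first ([CLS]) token's embedding as the sentence vector. In AllenNLP 1.x and later the same behaviour is available as a registered Seq2VecEncoder; a rough equivalent sketch (assuming the newer library and the word_embeddings object from above):

from allennlp.modules.seq2vec_encoders import ClsPooler

# Takes the embedding of the first token ([CLS]) as the pooled representation.
cls_encoder = ClsPooler(embedding_dim=word_embeddings.get_output_dim())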
Example #30
    def _find_model_function(self):
        embedding_dim = self.configuration['embed_size']
        embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
        if os.path.exists(embedding_matrix_filepath):
            embedding_matrix = super()._load_object(embedding_matrix_filepath)
        else:
            embedding_filepath = self.configuration['embedding_filepath']
            embedding_matrix = embedding._read_embeddings_from_text_file(
                embedding_filepath,
                embedding_dim,
                self.vocab,
                namespace='tokens')
            super()._save_object(embedding_matrix_filepath, embedding_matrix)
        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
            embedding_dim=embedding_dim,
            padding_index=0,
            vocab_namespace='tokens',
            trainable=False,
            weight=embedding_matrix)
        # the embedder maps the input tokens to the appropriate embedding matrix
        word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        position_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='position'),
            embedding_dim=25,
            padding_index=0)
        position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"position": position_embedding},
            # we'll be ignoring masks so we'll need to set this to True
            allow_unmatched_keys=True)

        bert_word_embedder = self._get_bert_word_embedder()

        model = pytorch_models.ConstituencyBert(
            word_embedder,
            position_embedder,
            self.distinct_categories,
            self.distinct_polarities,
            self.vocab,
            self.configuration,
            bert_word_embedder=bert_word_embedder)
        self._print_args(model)
        model = model.to(self.configuration['device'])
        return model