Example #1
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

        feed_forward_params = params.pop("feedforward", None)
        if feed_forward_params is not None:
            feedforward_layer = FeedForward.from_params(feed_forward_params)
        else:
            feedforward_layer = None
        pos_tag_embedding_params = params.pop("pos_tag_embedding", None)
        if pos_tag_embedding_params is not None:
            pos_tag_embedding = Embedding.from_params(vocab, pos_tag_embedding_params)
        else:
            pos_tag_embedding = None
        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
        evalb_directory_path = params.pop("evalb_directory_path", None)
        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   span_extractor=span_extractor,
                   encoder=encoder,
                   feedforward_layer=feedforward_layer,
                   pos_tag_embedding=pos_tag_embedding,
                   initializer=initializer,
                   regularizer=regularizer,
                   evalb_directory_path=evalb_directory_path)
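All of the snippets on this page target the AllenNLP 0.x API, where from_params takes an explicit Vocabulary plus a Params object and pops every key it consumes. A minimal, self-contained sketch of that pattern (the vocabulary contents and the embedding size below are made up for illustration):

from copy import deepcopy

from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", "tokens")
vocab.add_token_to_namespace("world", "tokens")

params = Params({"embedding_dim": 10, "vocab_namespace": "tokens"})
# from_params pops the keys it reads, so pass a copy if the Params object
# is needed again afterwards.
embedding = Embedding.from_params(vocab, deepcopy(params))
print(embedding.get_output_dim())  # 10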
Example #2
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
         "embedding": {
             "embedding_dim": 2,
             "vocab_namespace": "token_characters"
         },
         "encoder": {
             "type": "cnn",
             "embedding_dim": 2,
             "num_filters": 4,
             "ngram_filter_sizes": [1, 2],
             "output_dim": 3
         }
     })
     self.encoder = TokenCharactersEncoder.from_params(
         self.vocab, deepcopy(params))
     self.embedding = Embedding.from_params(self.vocab, params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = lambda tensor: torch.nn.init.constant(tensor, 1.)
     initializer = InitializerApplicator(default_initializer=constant_init)
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #3
 def setUp(self):
     super(TestNgramWordsEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("i", "ngram_words")
     self.vocab.add_token_to_namespace("go", "ngram_words")
     self.vocab.add_token_to_namespace("to", "ngram_words")
     self.vocab.add_token_to_namespace("school", "ngram_words")
     params = Params({
         "embedding": {
             "embedding_dim": 2,
             "vocab_namespace": "ngram_words"
         },
         "encoder": {
             "type": "cnn",
             "embedding_dim": 2,
             "num_filters": 4,
             "ngram_filter_sizes": [1, 2],
             "output_dim": 3
         }
     })
     self.encoder = NgramWordsEncoder.from_params(self.vocab,
                                                  deepcopy(params))
     self.embedding = Embedding.from_params(self.vocab, params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = lambda tensor: torch.nn.init.constant(tensor, 1.)
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #4
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace(u"1", u"token_characters")
     self.vocab.add_token_to_namespace(u"2", u"token_characters")
     self.vocab.add_token_to_namespace(u"3", u"token_characters")
     self.vocab.add_token_to_namespace(u"4", u"token_characters")
     params = Params({
         u"embedding": {
             u"embedding_dim": 2,
             u"vocab_namespace": u"token_characters"
         },
         u"encoder": {
             u"type": u"cnn",
             u"embedding_dim": 2,
             u"num_filters": 4,
             u"ngram_filter_sizes": [1, 2],
             u"output_dim": 3
         }
     })
     self.encoder = TokenCharactersEncoder.from_params(
         vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab,
                                            params=params[u"embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params[u"encoder"])
     constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
     initializer = InitializerApplicator([(u".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #5
 def setUp(self):
     super().setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
         "embedding": {
             "embedding_dim": 2,
             "vocab_namespace": "token_characters"
         },
         "encoder": {
             "type": "cnn",
             "embedding_dim": 2,
             "num_filters": 4,
             "ngram_filter_sizes": [1, 2],
             "output_dim": 3
         }
     })
     self.encoder = TokenCharactersEncoder.from_params(
         vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab,
                                            params=params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = Initializer.from_params(
         Params({
             "type": "constant",
             "val": 1.
         }))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #6
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

        feed_forward_params = params.pop("feedforward", None)
        if feed_forward_params is not None:
            feedforward_layer = FeedForward.from_params(feed_forward_params)
        else:
            feedforward_layer = None
        pos_tag_embedding_params = params.pop("pos_tag_embedding", None)
        if pos_tag_embedding_params is not None:
            pos_tag_embedding = Embedding.from_params(vocab, pos_tag_embedding_params)
        else:
            pos_tag_embedding = None
        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
        evalb_directory_path = params.pop("evalb_directory_path", None)
        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   span_extractor=span_extractor,
                   encoder=encoder,
                   feedforward_layer=feedforward_layer,
                   pos_tag_embedding=pos_tag_embedding,
                   initializer=initializer,
                   regularizer=regularizer,
                   evalb_directory_path=evalb_directory_path)
Example #7
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
             "embedding": {
                     "embedding_dim": 2,
                     "vocab_namespace": "token_characters"
                     },
             "encoder": {
                     "type": "cnn",
                     "embedding_dim": 2,
                     "num_filters": 4,
                     "ngram_filter_sizes": [1, 2],
                     "output_dim": 3
                     }
             })
     self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
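For context, the TokenCharactersEncoder built in the setUp fixtures above maps character ids of shape (batch, num_tokens, num_characters) to one vector per token. A hedged sketch of exercising it inside one of those tests (shapes and ids chosen arbitrarily):

import torch

# 2 sentences, 3 tokens each, 4 characters per token; ids index the small
# "token_characters" vocabulary built in setUp (0 is the padding id).
character_ids = torch.randint(0, self.vocab.get_vocab_size("token_characters"), (2, 3, 4))
token_vectors = self.encoder(character_ids)
assert token_vectors.shape == (2, 3, 3)  # output_dim of the CNN encoder is 3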
Example #8
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 inp_dim,
                 hid_dim,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 dropout: float = 0.4,
                 dropout_emb: float = 0.2,
                 pretrain_embedding_file=None,
                 gather='sum'):
        super(EncDoc, self).__init__(vocab, regularizer)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=inp_dim)

        if dropout_emb > 0:
            self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
        else:
            self._lexical_dropout = lambda x: x

        self.hid_dim = hid_dim
        self.sent_enc = EncWord2Sent(inp_dim=inp_dim,
                                     hid_dim=hid_dim,
                                     dropout=dropout,
                                     gather=gather)

        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info(
                "Loading word embedding: {}".format(pretrain_embedding_file))
            # from_params is a classmethod: assign its result so the
            # pretrained vectors are actually used.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    "pretrained_file": pretrain_embedding_file,
                    "embedding_dim": inp_dim
                }))
            print("token_embedding size: {}".format(
                token_embedding.num_embeddings))
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        self.sent2doc = EncWord2Sent(inp_dim=self.sent_enc.get_output_dim(),
                                     hid_dim=hid_dim,
                                     nenc_lay=1,
                                     dropout=dropout)
Example #9
    def __init__(self,
                 device,
                 inp_dim,
                 hid_dim,
                 compression,
                 vocab,
                 dropout: float = 0.4,
                 dropout_emb: float = 0.2,
                 pretrain_embedding_file=None):
        super().__init__()
        self.compression = compression
        self.hid_dim = hid_dim
        self.sent_enc = EncSent(device=device,
                                inp_dim=inp_dim,
                                hid_dim=hid_dim,
                                compression=compression)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=inp_dim)

        if dropout_emb > 0:
            self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
        else:
            self._lexical_dropout = lambda x: x

        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info(
                "Loading word embedding: {}".format(pretrain_embedding_file))
            # from_params is a classmethod: assign its result so the
            # pretrained vectors are actually used.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    "pretrained_file": pretrain_embedding_file,
                    "embedding_dim": inp_dim
                }))
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        self.sent2doc = EncWord2Sent(device=device,
                                     inp_dim=self.sent_enc.get_output_dim(),
                                     hidden_dim=hid_dim,
                                     nenc_lay=2,
                                     dropout=dropout)
Example #10
def glove_embeddings(vocab: Vocabulary, file_path: Path, dimension: int,
                     training: bool = True, namespace: str = 'tokens'
                     ) -> BasicTextFieldEmbedder:
    "Pre-trained embeddings using GloVe"
    token_embedding = Embedding.from_params(vocab, Params({
        "embedding_dim": dimension,
        "vocab_namespace": 'tokens',
        "pretrained_file": str(file_path),
        "trainable": training,
    }))
    word_embeddings = BasicTextFieldEmbedder({namespace: token_embedding})
    return word_embeddings
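A hypothetical call to the helper above; the GloVe file path and the 300-dimensional size are placeholders, and vocab is assumed to have been built from the training data:

from pathlib import Path

word_embeddings = glove_embeddings(vocab, Path("glove.840B.300d.txt"),
                                   dimension=300, training=False)
print(word_embeddings.get_output_dim())  # 300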
Example #11
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'SyllableEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     embedding_params: Params = params.pop("syllable_embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "token_characters" by default. If num_embeddings is present, set default namespace
     # to None so that extend_vocab call doesn't misinterpret that some namespace was originally used.
     default_namespace = None if embedding_params.get(
         "num_embeddings", None) else "token_characters"
     embedding_params.setdefault("vocab_namespace", default_namespace)
     embedding = Embedding.from_params(vocab, embedding_params)
     encoder_params: Params = params.pop("syllable_encoder")
     encoder = Seq2VecEncoder.from_params(encoder_params)
     dropout = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, encoder, dropout)
Example #12
File: encoder.py Project: blalalt/bert
 def __init__(self, embedding_file, vocab):
     super(GloveEncoder, self).__init__()
     _out_dim = 100
     self.token_embedding = Embedding.from_params(
         vocab=vocab,
         params=Params({'pretrained_file': embedding_file,
                        'embedding_dim': GLOVE_EMBEDDING_DIM})
     )
     self.embed = BasicTextFieldEmbedder({"tokens": self.token_embedding})
     self.encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(
         batch_first=True,
         bidirectional=True,
         input_size=GLOVE_EMBEDDING_DIM,
         hidden_size=_out_dim
     ))
     self._dropout = torch.nn.Dropout(0.5)
Example #13
    def __init__(self, params: Params, vocab: Vocabulary) -> None:
        super().__init__(vocab=vocab)

        enc_hidden_dim = params.pop_int('enc_hidden_dim', 300)
        disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
        disc_num_layers = params.pop_int('disc_num_layers', 1)

        emb_dropout = params.pop_float('emb_dropout', 0.0)
        disc_dropout = params.pop_float('disc_dropout', 0.0)
        l2_weight = params.pop_float('l2_weight', 0.0)

        self.emb_dropout = nn.Dropout(emb_dropout)
        self.disc_dropout = nn.Dropout(disc_dropout)
        self._l2_weight = l2_weight

        self._token_embedder = Embedding.from_params(
            vocab=vocab, params=params.pop('token_embedder'))
        self._discriminator_encoder = PytorchSeq2VecWrapper(
            nn.LSTM(input_size=self._token_embedder.get_output_dim(),
                    hidden_size=enc_hidden_dim,
                    batch_first=True))
        self._discriminator = FeedForward(
            input_dim=4 * self._discriminator_encoder.get_output_dim(),
            hidden_dims=[disc_hidden_dim] * disc_num_layers +
            [self._NUM_LABELS],
            num_layers=disc_num_layers + 1,
            activations=[Activation.by_name('relu')()] * disc_num_layers +
            [Activation.by_name('linear')()])

        # Metrics
        self._metrics = {
            'labeled': {
                'discriminator_entropy': ScalarMetric(),
                'discriminator_accuracy': CategoricalAccuracy(),
                'loss': ScalarMetric()
            }
        }
Example #14
def construct_model(vocab, args):
    # token embedding

    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))

    word_embedding = BasicTextFieldEmbedder({
        "token_words": word_embedding
    })

    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(embedding=Embedding(embedding_dim=20,
                                                                       num_embeddings=262),
                                                   encoder=CnnEncoder(embedding_dim=20,
                                                                      ngram_filter_sizes=[5],
                                                                      num_filters=50)),
    })

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100,
                      num_layers=1,
                      hidden_size=100,
                      bidirectional=True,
                      batch_first=True))

    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)

    return model
Example #15
    def from_params(cls,
                    vocab: Vocabulary,
                    params: Params,
                    constructor_to_call=None,
                    constructor_to_inspect=None) -> 'ProLocalModel':
        embedder_params = params.pop("text_field_embedder")
        token_params = embedder_params.pop("tokens")
        embedding = Embedding.from_params(vocab=vocab, params=token_params)
        text_field_embedder = BasicTextFieldEmbedder(
            token_embedders={'tokens': embedding})
        #         text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)

        seq2seq_encoder_params = params.pop("seq2seq_encoder")
        seq2seq_encoder = Seq2SeqEncoder.from_params(seq2seq_encoder_params)

        initializer = InitializerApplicator(
        )  #.from_params(params.pop("initializer", []))

        params.assert_empty(cls.__name__)
        #         print(cls)
        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   seq2seq_encoder=seq2seq_encoder,
                   initializer=initializer)
Example #16
def load_decomposable_attention_elmo_softmax_model():
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""				# NLL
    # LOSS_TYPE = "_nll"				# NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))
    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file':
                                                    glove_embeddings_file,
                                                    'embedding_dim':
                                                    EMBEDDING_DIM,
                                                    'projection_dim':
                                                    PROJECT_DIM,
                                                    'trainable':
                                                    False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    params = Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    params = Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    compare_feedforward = FeedForward.from_params(params)
    params = Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    })
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")
    # Load model state
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cuda:0'))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1
    else:
        cuda_device = -1

    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
Example #17
    def embeddings_returner(self, vocab=None):
        '''
        Either the name of the pretrained model to use (e.g. bert-base-uncased), or the path to the .tar.gz
        file with the model weights.
        :param vocab: vocab_size and vocab are needed only when pretrained embeddings are used.
        :return: embedder
        '''
        '''
        "bert-base-uncased", do_lower_case=True
        "bert-base-cased" , do_lower_case=False
        https://github.com/huggingface/pytorch-transformers/issues/712
        https://qiita.com/uedake722/items/b7f4b75b4d77d9bd358b
        '''
        if self.embedding_strategy == 'bert':
            self.bertmodel_dir = ''
            if self.ifbert_use_whichmodel == 'general':
                self.bertmodel_dir += 'bert-base-uncased/'  # recommended version is uncased, per the original repository
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir

                # included in pytorch_transformers, so we replace it with model name itself
                self.bert_weight_filepath = copy.copy('bert-base-uncased')

            elif self.ifbert_use_whichmodel == 'scibert':
                self.bertmodel_dir += 'scibert_scivocab_uncased/'  # recommended version is uncased, per the original repository
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
                self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'

            elif self.ifbert_use_whichmodel == 'biobert':
                self.bertmodel_dir += 'biobert_v1.1_pubmed/'  # currently only the cased version is supported
                self.bertmodel_relative_dirpath = self.bert_src_dir + self.bertmodel_dir
                self.bert_weight_filepath = self.bertmodel_relative_dirpath + 'weights.tar.gz'  # including bert_config.json and bin.

            # Load embedder
            bert_embedder = PretrainedBertEmbedder(
                pretrained_model=self.bert_weight_filepath,
                top_layer_only=self.bert_top_layer_only,
                requires_grad=self.emb_requires_grad)
            return bert_embedder, bert_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': bert_embedder},
                                      allow_unmatched_keys=True)

        elif self.embedding_strategy == 'elmo':
            if self.ifelmo_use_whichmodel == 'general':
                options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
                weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
            elif self.ifelmo_use_whichmodel == 'pubmed':
                options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_options.json'
                weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pubmed/elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5'
            elif self.ifelmo_use_whichmodel == 'bioelmo':
                options_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_options.json'
                weight_file = self.elmo_src_dir + 'BioELMo/weights/biomed_elmo_weights.hdf5'
            else:
                options_file = -1
                weight_file = -1
            assert options_file != -1
            elmo_embedder = ElmoTokenEmbedder(
                options_file=options_file,
                weight_file=weight_file,
                requires_grad=self.emb_requires_grad)
            return elmo_embedder, elmo_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': elmo_embedder})

        elif self.embedding_strategy == 'pretrained':

            print('\nGloVe pretrained vocab loading\n')

            if 'glove' in self.args.ifpretrained_use_whichmodel:
                embedding_dim = 300
            else:
                embedding_dim = 200

            pretrain_emb_embedder = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    'pretrained_file': self.glove_embeddings_file,
                    'embedding_dim': embedding_dim,
                    'trainable': False,
                    'padding_index': 0
                }))

            return pretrain_emb_embedder, pretrain_emb_embedder.get_output_dim(
            ), BasicTextFieldEmbedder({'tokens': pretrain_emb_embedder})
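Whatever strategy is selected, embeddings_returner returns the raw token embedder, its output dimension, and a ready-to-use BasicTextFieldEmbedder. A hypothetical call site (the surrounding class instance and vocab are assumed):

token_embedder, embedding_dim, text_field_embedder = self.embeddings_returner(vocab=vocab)
# embedding_dim can then size whatever encoder consumes text_field_embedder's output.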
Example #18
    def __init__(self, params: Params, vocab: Vocabulary) -> None:
        super().__init__(vocab=vocab)

        disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
        disc_num_layers = params.pop_int('disc_num_layers', 1)
        code_dist_type = params.pop_choice('code_dist_type',
                                           ['gaussian', 'vmf'],
                                           default_to_first_choice=True)
        code_dim = params.pop_int('code_dim', 500)

        emb_dropout = params.pop_float('emb_dropout', 0.0)
        disc_dropout = params.pop_float('disc_dropout', 0.0)
        latent_dropout = params.pop_float('latent_dropout', 0.0)
        l2_weight = params.pop_float('l2_weight', 0.0)

        self.emb_dropout = nn.Dropout(emb_dropout)
        self.disc_dropout = nn.Dropout(disc_dropout)
        self.latent_dropout = nn.Dropout(latent_dropout)
        self._l2_weight = l2_weight

        self._token_embedder = Embedding.from_params(
            vocab=vocab, params=params.pop('token_embedder'))
        self._encoder = nn.Sequential(
            nn.Conv1d(in_channels=300,
                      out_channels=300,
                      kernel_size=5,
                      stride=2),
            nn.Conv1d(in_channels=300,
                      out_channels=600,
                      kernel_size=5,
                      stride=2),
            nn.Conv1d(in_channels=600,
                      out_channels=500,
                      kernel_size=5,
                      stride=2))
        self._generator = nn.Sequential(
            nn.ConvTranspose1d(in_channels=500,
                               out_channels=600,
                               kernel_size=5,
                               stride=2), nn.ReLU(),
            nn.ConvTranspose1d(in_channels=600,
                               out_channels=300,
                               kernel_size=5,
                               stride=2), nn.ReLU(),
            nn.ConvTranspose1d(in_channels=300,
                               out_channels=300,
                               kernel_size=5,
                               stride=2), nn.ReLU())
        self._generator_projector = nn.Linear(
            in_features=300, out_features=vocab.get_vocab_size(), bias=False)
        self._generator_projector.weight = self._token_embedder.weight

        if code_dist_type == 'vmf':
            vmf_kappa = params.pop_int('vmf_kappa', 150)
            self._code_generator = VmfCodeGenerator(input_dim=500,
                                                    code_dim=code_dim,
                                                    kappa=vmf_kappa)
        elif code_dist_type == 'gaussian':
            self._code_generator = GaussianCodeGenerator(input_dim=500,
                                                         code_dim=code_dim)
        else:
            raise ValueError('Unknown code_dist_type')

        self._discriminator = FeedForward(
            input_dim=4 * self._code_generator.get_output_dim(),
            hidden_dims=[disc_hidden_dim] * disc_num_layers +
            [self._NUM_LABELS],
            num_layers=disc_num_layers + 1,
            activations=[Activation.by_name('relu')()] * disc_num_layers +
            [Activation.by_name('linear')()],
            dropout=disc_dropout)

        self._kl_weight = 1.0
        self._discriminator_weight = params.pop_float('discriminator_weight',
                                                      0.1)
        self._gumbel_temperature = 1.0

        # Metrics
        self._metrics = {
            'generator_loss': ScalarMetric(),
            'kl_divergence': ScalarMetric(),
            'discriminator_accuracy': CategoricalAccuracy(),
            'discriminator_loss': ScalarMetric(),
            'loss': ScalarMetric()
        }
Example #19
File: train.py Project: vrupp/teaching
    "pre_trained_embedding": "../data/glove.42B.300d.txt",
    "model": "knrm",
    "train_data": "../data/triples.train.tsv",
    "validation_data": "../data/tuples.validation.tsv",
    "test_data": "../data/tuples.test.tsv",
}

#
# data loading
#

vocab = Vocabulary.from_files(config["vocab_directory"])
tokens_embedder = Embedding.from_params(
    vocab,
    Params({
        "pretrained_file": config["pre_trained_embedding"],
        "embedding_dim": 300,
        "trainable": True,
        "padding_index": 0
    }))

word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder})

# recommended default params for the models (but you may change them if you want)
if config["model"] == "knrm":
    model = KNRM(word_embedder, n_kernels=11)
elif config["model"] == "conv_knrm":
    model = Conv_KNRM(word_embedder, n_grams=3, n_kernels=11, conv_out_dim=128)
elif config["model"] == "match_pyramid":
    model = MatchPyramid(word_embedder,
                         conv_output_size=[16, 16, 16, 16, 16],
                         conv_kernel_size=[[3, 3], [3, 3], [3, 3], [3, 3],
Example #20
File: text.py Project: xbraininc/viraal
 def instantiate_word_embedding(self):
     embedding_params = Params(OmegaConf.to_container(self.c.dataset.embedding))
     token_embedding = Embedding.from_params(self.vocab, embedding_params)
     self.word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
Example #21
    def train(self, args_hpo, index):
        """
        Trains the model and returns the metrics to the meta optimizer.
        :param args_hpo:
        :param index:
        :return:
        """
        PrintColors.prYellow('\n===== training with: {}'.format(args_hpo))
        PrintColors.prGreen('----- in {} mode -----'.format('train'))
        ''' ============ LOAD DATA ================================================================================ '''
        starting_time = time.time()
        lm_dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
        train_data, val_data = (lm_dataset_reader.read(folder) for folder in
                                [_train_data_path, _val_data_path])
        lm_vocabulary = Vocabulary.from_instances(train_data + val_data)
        iterator = BasicIterator(batch_size=args_hpo.batch_size)
        iterator.index_with(lm_vocabulary)
        ''' ============ DEFINE MODEL ============================================================================= '''
        ''' 
        The Params class 'pops' its parameters, i.e. they disappear after first use, so we instantiate a fresh
        Params instance for each model-defining run. On top of that, Params turns the dict it is given into a
        mutable mapping and destroys the original dict, hence the copy.deepcopy below.
        '''
        token_embedding = Embedding.from_params(vocab=lm_vocabulary,
                                                params=Params(copy.deepcopy(GLOBAL_CONSTANTS.GLOVE_PARAMS_CONFIG)))

        token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({'tokens': token_embedding})
        ''' define encoder to wrap up an lstm feature extractor '''
        contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_size=args_hpo.word_embedding_size,
                          hidden_size=args_hpo.ed_ncoder_size,
                          bidirectional=False, batch_first=True))

        model = LanguageModel(vocab=lm_vocabulary,
                              text_field_embedder=token_embedder,
                              contextualizer=contextualizer,
                              dropout=args_hpo.dropout,
                              regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=args_hpo.l2))]),
                              )\
            .cuda(_device)

        ''' ============ TRAIN ================================================================================ '''
        '''  callbacks  '''
        if index == 0:
            for file in os.listdir(os.path.join(*['.', 'lm_models'])):
                path = os.path.join(*['.', 'lm_models', file])
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path)
        serialization_path = 'models_lm_{}_{}'.format(_tag, index)
        serialization_path_longer = os.path.join(*['.', 'lm_models', serialization_path])
        vocab_path = 'vocab_lm_{}_{}'.format(_tag, index)
        vocab_dir_longer = os.path.join(*['.', 'lm_models', vocab_path])
        if not os.path.exists(serialization_path_longer):
            os.mkdir(serialization_path_longer)
        callbacks = list()
        ''' for validation '''
        callbacks.append(validate.Validate(validation_data=val_data, validation_iterator=iterator))
        ''' for early stopping. it tracks 'loss' returned by model.forward() '''
        callbacks.append(track_metrics.TrackMetrics(patience=3))
        ''' for grad clipping '''
        callbacks.append(gradient_norm_and_clip.GradientNormAndClip(grad_clipping=args_hpo.clip))
        ''' 
            for checkpointing
            TODO: NOTE:serialization path CANNOT exist before training ??
        '''
        model_checkpointer = checkpointer.Checkpointer(serialization_dir=serialization_path_longer,
                                                       num_serialized_models_to_keep=1)
        callbacks.append(checkpoint.Checkpoint(checkpointer=model_checkpointer))
        ''' for sample generations '''

        callback_trainer = CallbackTrainer(
            model=model,
            training_data=train_data,
            iterator=iterator,
            optimizer=torch.optim.Adam(model.parameters(), lr=args_hpo.lr),
            num_epochs=_n_epochs,
            serialization_dir=serialization_path_longer,
            cuda_device=_device,
            callbacks=callbacks
        )

        ''' trainer saves the model, but the vocabulary needs to be saved, too '''
        lm_vocabulary.save_to_files(vocab_dir_longer)

        ''' check the metric names to synchronize with the class '''
        metrics = callback_trainer.train()
        metrics['time_consumed(hrs)'] = round((time.time() - starting_time) / 3600, 4)

        return metrics
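The comment in the example above is accurate for this API: Params.pop removes keys as they are read, so a config that will be fed to from_params more than once has to be copied first. A small illustration with made-up keys:

from copy import deepcopy
from allennlp.common import Params

glove_config = {"embedding_dim": 100, "vocab_namespace": "tokens"}
params = Params(deepcopy(glove_config))  # keep the original dict intact
params.pop("embedding_dim")              # consumed: gone from params
assert "embedding_dim" in glove_config   # the plain-dict original is untouched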
Example #22
 EMBEDDING_DIM = 300
 PROJECT_DIM = 200
 DROPOUT = 0.2
 NUM_LAYERS = 2
 if EMBEDDING_TYPE == "":
     token_embedding = Embedding(
         num_embeddings=vocab.get_vocab_size('tokens'),
         embedding_dim=EMBEDDING_DIM,
         projection_dim=PROJECT_DIM)
 elif EMBEDDING_TYPE == "_glove":
     token_embedding = Embedding.from_params(vocab=vocab,
                                             params=Params({
                                                 'pretrained_file':
                                                 glove_embeddings_file,
                                                 'embedding_dim':
                                                 EMBEDDING_DIM,
                                                 'projection_dim':
                                                 PROJECT_DIM,
                                                 'trainable':
                                                 False
                                             }))
 elif EMBEDDING_TYPE == "_elmo":
     # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
     # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
     options_file = os.path.join(
         "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
     weights_file = os.path.join(
         "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
     # NOTE: using Small size as medium size gave CUDA out of memory error
     # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
     # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
Example #23
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset,
                                      max_vocab_size=VOCAB_SIZE)

    BATCH_SIZE = cfg.training.batch_size

    # Iterator that produces padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pretrained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim':
        200,
        'padding_index':
        0,
        'pretrained_file':
        os.path.join(cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type':
        norm
    })

    token_embedding = Embedding.from_params(vocab=vocab, params=params)
    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                HIDDEN_SIZE,
                bidirectional=True,
                batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True

    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None

    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)

    return model, metrics
            config["validation_end_candidate_set_from_to"][1])

    test_candidate_set = None
    if "test_candidate_set_max" in config and "test_candidate_set_path" in config:
        test_candidate_set = parse_candidate_set(
            config["test_candidate_set_path"],
            config["test_candidate_set_max"])

    # embedding layer (use pre-trained, but make it trainable as well)
    if config["token_embedder_type"] == "embedding":
        vocab = Vocabulary.from_files(config["vocab_directory"])
        tokens_embedder = Embedding.from_params(
            vocab,
            Params({
                "pretrained_file": config["pre_trained_embedding"],
                "embedding_dim": config["pre_trained_embedding_dim"],
                "trainable": config["train_embedding"],
                "padding_index": 0,
                "sparse": config["sparse_gradient_embedding"]
            }))
    elif config["token_embedder_type"] == "fasttext":
        vocab = None  #FastTextVocab(config["fasttext_vocab_mapping"])
        tokens_embedder = FastTextEmbeddingBag(numpy.load(
            config["fasttext_weights"]),
                                               sparse=True)

    elif config["token_embedder_type"] == "elmo":
        vocab = None
        tokens_embedder = ElmoTokenEmbedder(config["elmo_options_file"],
                                            config["elmo_weights_file"])
    else:
Example #25
iterator.index_with(vocab)

val_iterator = BucketIterator(
    batch_size=config.eval_batch_size,
    sorting_keys=[("text", "num_tokens")],
)
val_iterator.index_with(vocab)

if args.embedding_type == 'glove':
    param_dict = {
        "pretrained_file":
        "(https://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.300d.txt",
        "embedding_dim": 300
    }
    params = Params(params=param_dict)
    token_embedding = Embedding.from_params(vocab=vocab, params=params)
elif args.embedding_type == 'elmo':
    token_embedding = ElmoTokenEmbedder(args.options_file,
                                        args.weights_file,
                                        requires_grad=args.finetune_embeddings)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

if args.encoder_type == 'bag':
    encoder = BagOfEmbeddingsEncoder(word_embeddings.get_output_dim())
elif args.encoder_type == 'lstm':
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embeddings.get_output_dim(),
                      config.hidden_sz,
                      bidirectional=True,
                      batch_first=True))
Example #26
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        word_embedding_dim: int = 200,
        hidden_dim: int = 200,
        dropout_emb: float = 0.5,
        min_dec_step: int = 2,
        max_decoding_steps=3,
        fix_edu_num=-1,
        dropout: float = 0.5,
        alpha: float = 0.5,
        span_encoder_type='self_attentive',
        use_elmo: bool = True,
        attn_type: str = 'general',
        schedule_ratio_from_ground_truth: float = 0.8,
        pretrain_embedding_file=None,
        nenc_lay: int = 2,
        mult_orac_sampling: bool = False,
        word_token_indexers=None,
        compression: bool = True,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        compress_leadn: int = -1,
        subsentence: bool = False,
        gather='mean',
        keep_threshold: float = 0.5,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name: str = "",
    ) -> None:

        super(Seq2IdxSum, self).__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder

        elmo_weight = os.path.join(
            abs_dir_root, "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        # if not os.path.isfile(elmo_weight):
        #     import subprocess
        #     x = "wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 -P {}".format(abs_dir_root)
        #     subprocess.run(x.split(" "))

        self.device = get_device()
        self.vocab = vocab
        self.dbg = dbg
        self.loss_thres = keep_threshold
        self.compression = compression
        self.comp_leadn = compress_leadn
        # Just encode the whole document without looking at compression options
        self.enc_doc = EncDoc(inp_dim=word_embedding_dim,
                              hid_dim=hidden_dim,
                              vocab=vocab,
                              dropout=dropout,
                              dropout_emb=dropout_emb,
                              pretrain_embedding_file=pretrain_embedding_file,
                              gather=gather)

        self.sent_dec = SentRNNDecoder(
            rnn_type='lstm',
            dec_hidden_size=self.enc_doc.get_output_dim(),
            dec_input_size=self.enc_doc.get_output_dim(),
            dropout=dropout,
            fixed_dec_step=fix_edu_num,
            max_dec_steps=max_decoding_steps,
            min_dec_steps=min_dec_step,
            schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
            dec_avd_trigram_rep=dec_avd_trigram_rep,
            mult_orac_sample_one=mult_orac_sampling,
            abs_board_file=abs_board_file,
            valid_tmp_path=abs_dir_root,
            serilization_name=serilization_name)
        if compression:
            self.compression_dec = CompressDecoder(
                context_dim=hidden_dim * 2,
                dec_state_dim=hidden_dim * 2,
                enc_hid_dim=hidden_dim,
                text_field_embedder=self.enc_doc._text_field_embedder,
                aggressive_compression=aggressive_compression,
                keep_threshold=keep_threshold,
                abs_board_file=abs_board_file,
                gather=gather,
                dropout=dropout,
                dropout_emb=dropout_emb,
                valid_tmp_path=abs_dir_root,
                serilization_name=serilization_name,
                vocab=vocab,
                elmo=use_elmo,
                elmo_weight=elmo_weight)
            self.aggressive_compression = aggressive_compression

        self.use_elmo = use_elmo
        if use_elmo:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            # print(self.elmo.get_output_dim())
            self._context_layer = PytorchSeq2SeqWrapper(
                torch.nn.LSTM(word_embedding_dim + self.elmo.get_output_dim(),
                              hidden_dim,
                              batch_first=True,
                              bidirectional=True))
        else:

            self._context_layer = PytorchSeq2SeqWrapper(
                torch.nn.LSTM(word_embedding_dim,
                              hidden_dim,
                              batch_first=True,
                              bidirectional=True))

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=word_embedding_dim)
        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info(
                "Loading word embedding: {}".format(pretrain_embedding_file))
            # from_params is a classmethod: assign its result so the
            # pretrained vectors are actually used.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    "pretrained_file": pretrain_embedding_file,
                    "embedding_dim": word_embedding_dim
                }))
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        # if span_encoder_type == 'self_attentive':
        #     self._span_encoder = SelfAttentiveSpanExtractor(
        #         self._context_layer.get_output_dim()
        #     )
        # else:
        #     raise NotImplementedError

        self._dropout = torch.nn.Dropout(p=dropout)
        self._max_decoding_steps = max_decoding_steps
        self._fix_edu_num = fix_edu_num
        if compression:
            pass
            # self.rouge_metrics_compression = self.compression_dec.rouge_metrics_compression
            # self.rouge_metrics_compression_upper_bound = self.compression_dec.rouge_metrics_compression_best_possible
        self.rouge_metrics_sent = self.sent_dec.rouge_metrics_sent
        self.mult_orac_sampling = mult_orac_sampling
        self.alpha = alpha
        initializer(self)
        if regularizer is not None:
            regularizer(self)
        self.counter = 0  # used for controlling compression and extraction
Example #27
 def from_params(cls, vocab, params):
     return UncontextualizedEmbedding(
         embedding=Embedding.from_params(vocab, params))
Example #28
    def __init__(self, params: Params, vocab: Vocabulary) -> None:
        super().__init__(vocab=vocab)

        enc_hidden_dim = params.pop_int('enc_hidden_dim', 300)
        gen_hidden_dim = params.pop_int('gen_hidden_dim', 300)
        disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
        disc_num_layers = params.pop_int('disc_num_layers', 1)
        code_dist_type = params.pop_choice('code_dist_type',
                                           ['gaussian', 'vmf'],
                                           default_to_first_choice=True)
        code_dim = params.pop_int('code_dim', 300)
        label_emb_dim = params.pop_int('label_emb_dim', 50)
        shared_encoder = params.pop_bool('shared_encoder', True)
        tie_embedding = params.pop_bool('tie_embedding', False)
        auto_weighting = params.pop_bool('auto_weighting', False)

        emb_dropout = params.pop_float('emb_dropout', 0.0)
        disc_dropout = params.pop_float('disc_dropout', 0.0)
        l2_weight = params.pop_float('l2_weight', 0.0)

        self.emb_dropout = nn.Dropout(emb_dropout)
        self.disc_dropout = nn.Dropout(disc_dropout)
        self._l2_weight = l2_weight
        self.auto_weighting = auto_weighting

        self._token_embedder = Embedding.from_params(
            vocab=vocab, params=params.pop('token_embedder'))
        self._label_embedder = Embedding(num_embeddings=self._NUM_LABELS,
                                         embedding_dim=label_emb_dim)
        self._encoder = PytorchSeq2VecWrapper(
            nn.LSTM(input_size=self._token_embedder.get_output_dim(),
                    hidden_size=enc_hidden_dim,
                    batch_first=True))
        self._generator = PytorchSeq2SeqWrapper(
            nn.LSTM(input_size=(self._token_embedder.get_output_dim() +
                                code_dim + label_emb_dim),
                    hidden_size=gen_hidden_dim,
                    batch_first=True))
        self._generator_projector = nn.Linear(
            in_features=self._generator.get_output_dim(),
            out_features=vocab.get_vocab_size())
        self._discriminator_encoder = PytorchSeq2VecWrapper(
            nn.LSTM(input_size=self._token_embedder.get_output_dim(),
                    hidden_size=enc_hidden_dim,
                    batch_first=True))
        if shared_encoder:
            self._discriminator_encoder = self._encoder
        if tie_embedding:
            self._generator_projector.weight = self._token_embedder.weight

        self._discriminator = FeedForward(
            input_dim=4 * self._discriminator_encoder.get_output_dim(),
            hidden_dims=[disc_hidden_dim] * disc_num_layers +
            [self._NUM_LABELS],
            num_layers=disc_num_layers + 1,
            activations=[Activation.by_name('relu')()] * disc_num_layers +
            [Activation.by_name('linear')()],
            dropout=disc_dropout)
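        # Latent-code generator: von Mises-Fisher or Gaussian, selected by code_dist_type.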
        if code_dist_type == 'vmf':
            vmf_kappa = params.pop_int('vmf_kappa', 150)
            self._code_generator = VmfCodeGenerator(
                input_dim=self._encoder.get_output_dim(),
                code_dim=code_dim,
                kappa=vmf_kappa)
        elif code_dist_type == 'gaussian':
            self._code_generator = GaussianCodeGenerator(
                input_dim=self._encoder.get_output_dim(), code_dim=code_dim)
        else:
            raise ValueError('Unknown code_dist_type')

        self._kl_weight = 1.0
        self._discriminator_weight = params.pop_float('discriminator_weight',
                                                      0.1)
        self._gumbel_temperature = 1.0

        self._use_sampling = params.pop_bool('use_sampling', False)

        if auto_weighting:
            self.num_tasks = num_tasks = 3
            self.task_weights = nn.Parameter(torch.zeros(num_tasks))

        # Metrics
        self._metrics = {
            'labeled': {
                'generator_loss': ScalarMetric(),
                'kl_divergence': ScalarMetric(),
                'discriminator_entropy': ScalarMetric(),
                'discriminator_accuracy': CategoricalAccuracy(),
                'discriminator_loss': ScalarMetric(),
                'loss': ScalarMetric()
            },
            'unlabeled': {
                'generator_loss': ScalarMetric(),
                'kl_divergence': ScalarMetric(),
                'discriminator_entropy': ScalarMetric(),
                'loss': ScalarMetric()
            },
            'aux': {
                'discriminator_entropy': ScalarMetric(),
                'discriminator_accuracy': CategoricalAccuracy(),
                'discriminator_loss': ScalarMetric(),
                'gumbel_temperature': ScalarMetric(),
                'loss': ScalarMetric(),
                'code_log_prob': ScalarMetric(),
                'cosine_dist': ScalarMetric()
            }
        }
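The constructor above pops every hyperparameter from a Params object. Below is a minimal sketch of a configuration it would accept; the values simply echo the defaults visible in the code, and the token_embedder sub-config is an assumed minimal Embedding config rather than anything taken from the original.

# Sketch of a Params blob for the constructor above (values mirror the code's defaults;
# the token_embedder block is an assumption).
from allennlp.common import Params

example_params = Params({
    'enc_hidden_dim': 300,
    'gen_hidden_dim': 300,
    'disc_hidden_dim': 1200,
    'disc_num_layers': 1,
    'code_dist_type': 'vmf',
    'vmf_kappa': 150,
    'code_dim': 300,
    'label_emb_dim': 50,
    'shared_encoder': True,
    'tie_embedding': False,
    'auto_weighting': False,
    'emb_dropout': 0.0,
    'disc_dropout': 0.0,
    'l2_weight': 0.0,
    'discriminator_weight': 0.1,
    'use_sampling': False,
    'token_embedder': {
        'embedding_dim': 300,
        'vocab_namespace': 'tokens'
    }
})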
예제 #29
0

token_indexer = SingleIdTokenIndexer()
reader = lstmDatasetReader(token_indexers={"tokens": token_indexer})
full_dataset = reader.read("pol_train_semibalanced.csv")
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, validation_dataset = random_split(full_dataset,
                                                 [train_size, test_size])
test_dataset = reader.read("pol_test_semibalanced.csv")

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  max_vocab_size=config.max_vocab_size)
token_embedder = Embedding.from_params(vocab=vocab,
                                       params=Params({
                                           'pretrained_file':
                                           'glove.twitter.27B.50d.txt',
                                           'embedding_dim': 50
                                       }))
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder})

# Iterator: batches the data and prepares it for model input
from allennlp.data.iterators import BucketIterator

iterator = BucketIterator(batch_size=config.batch_size,
                          sorting_keys=[("tokens", "num_tokens")],
                          max_instances_in_memory=512)
iterator.index_with(vocab)
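
A hedged sketch of wiring this iterator into an AllenNLP 0.x Trainer. The model variable and the optimizer/epoch settings are assumptions; the model is presumed to be an AllenNLP Model built from word_embeddings and the LSTM constructed below (for example the LSTM_Model wrapper that appears in a later snippet).

# Minimal training-loop sketch, assuming `model` wraps word_embeddings and the LSTM below.
import torch.optim as optim
from allennlp.training.trainer import Trainer

optimizer = optim.Adam(model.parameters(), lr=1e-3)  # assumed optimizer settings
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=10,
                  cuda_device=-1)
trainer.train()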

lstm = PytorchSeq2VecWrapper(
    nn.LSTM(word_embeddings.get_output_dim(),
            config.hidden_sz,
            bidirectional=True,
            batch_first=True))
예제 #30
0
def save_top_results(process_no, start_index, end_index):
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150 # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")

    #NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove",
                                         "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    #NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))

    #NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))

    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)

            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
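            # Build the token embedder that matches the embedding type of the saved model.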
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo",
                    "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "options_2.json")
                weights_file = os.path.join("data", "bilm-tf",
                                            "elmo_retrained", "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                PROJECT_DIM = 768
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
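            # Feedforward components of the decomposable attention model: attend, compare and aggregate.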
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state
            with open(model_file, 'rb') as f:
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1
            else:
                cuda_device = -1

            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # before working on anything just save all the questions and responses in a list
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file,
                      'r') as q_reader, open(r_file, "r") as r_reader, open(
                          rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                current_qa = (q, "")
                current_rules_and_responses = list()
                for i, (response,
                        rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Create a instance
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        if rule or response:
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()

                        if examples_count < start_index:
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break

                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        # 	print(examples_count)
                    else:
                        # Serious Bug
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
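                # Rank candidate responses by predicted probability and keep the top three.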
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))
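The process_no / start_index / end_index signature suggests save_top_results is meant to be sharded across worker processes. A hedged driver sketch follows; the dataset size and process count are assumptions, not values from the original script.

# Sketch of a multiprocess driver for save_top_results (sizes are assumptions).
from multiprocessing import Process

TOTAL_EXAMPLES = 10000  # assumed number of (question, responses) groups
NUM_PROCESSES = 4       # assumed degree of parallelism
CHUNK = TOTAL_EXAMPLES // NUM_PROCESSES

if __name__ == "__main__":
    processes = []
    for p in range(NUM_PROCESSES):
        start = p * CHUNK
        # end_index is treated as inclusive inside save_top_results
        end = TOTAL_EXAMPLES if p == NUM_PROCESSES - 1 else (p + 1) * CHUNK - 1
        proc = Process(target=save_top_results, args=(p, start, end))
        proc.start()
        processes.append(proc)
    for proc in processes:
        proc.join()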
예제 #31
0
        return np.concatenate(preds, axis=0), np.concatenate(labels, axis=0)


token_indexer = SingleIdTokenIndexer()
reader = rnnDatasetReader(token_indexers={"tokens": token_indexer})
test_dataset = reader.read(
    "/home/dkeren/Documents/Spring2019/CIS520/project/pol_test_semibalanced.csv"
)

vocab = Vocabulary.from_files(
    "/tmp/vocabulary")  # preloaded vocabulary, required for lazy computation

token_embedder = Embedding.from_params(
    vocab=vocab,
    params=Params({
        'pretrained_file':
        '/home/dkeren/Documents/Spring2019/CIS520/project/glove.twitter.27B.50d.txt',
        'embedding_dim': 50
    }))
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedder})

lstm = PytorchSeq2VecWrapper(
    nn.LSTM(word_embeddings.get_output_dim(),
            config.hidden_sz,
            bidirectional=True,
            batch_first=True))

save_file = "model_v12.th"  # model checkpoint saved by the LSTM training run
model2 = LSTM_Model(word_embeddings, lstm, 2)
with open(save_file, 'rb') as f:
    model2.load_state_dict(torch.load(f))
예제 #32
0
                "type": "gru",
                "input_size": 100,
                "hidden_size": 50,
                "num_layers": 2,
                "dropout": 0.25,
                "bidirectional": True
            }
        }
    })
    #text_field_embedder = TextFieldEmbedder.from_params(text_field_embedder_cfg,vocab=vocab)

    # token based -> maybe do average from this
    glove_text_field_embedder = Embedding.from_params(
        vocab,
        Params({
            "pretrained_file":
            "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
            "embedding_dim": 100,
            "trainable": False
        }))

    #text_field_embedder= TextFieldEmbedder.from_params(text_field_embedder_cfg)
    # """You need to be sure that the TextFieldEmbedder is expecting the same thing that your DatasetReader is producing, but that happens in the configuration file, and we'll talk about it later."""

    trainer_cfg = Params({
        "iterator": {
            "type": "basic",
            "batch_size": 32
        },
        "trainer": {
            "optimizer": {
                "type": "adam"