Example #1
 def __init__(self, model_type: str, max_seq_len: int = 128):
     self.model_type = model_type
     self.max_seq_len = max_seq_len
     self.token_indexer = PretrainedBertIndexer(
         pretrained_model=self.model_type,
         max_pieces=self.max_seq_len,
         do_lowercase=True,
     )
     self.vocab = Vocabulary()
     self.token_indexer._add_encoding_to_vocabulary(self.vocab)
     self.full_vocab = {v: k for k, v in self.token_indexer.vocab.items()}
Example #2
    def __init__(self,
                 lazy: bool = False,
                 question_token_indexers: Dict[str, TokenIndexer] = None,
                 keep_if_unparsable: bool = True,
                 tables_file: str = None,
                 dataset_path: str = 'dataset/database',
                 load_cache: bool = True,
                 save_cache: bool = True,
                 loading_limit: int = -1):
        super().__init__(lazy=lazy)

        # the default spaCy tokenizer splits the common token 'id' into ['i', 'd']; we apply a manual fix for that here
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            do_lowercase=True,
            use_starting_offsets=True)
        self._tokenizer = Bert_Tokenizer(token_indexer.wordpiece_tokenizer)

        self._utterance_token_indexers = question_token_indexers or {
            'tokens': token_indexer
        }
        self._keep_if_unparsable = keep_if_unparsable

        self._tables_file = tables_file
        self._dataset_path = dataset_path

        self._load_cache = load_cache
        self._save_cache = save_cache
        self._loading_limit = loading_limit
Example #3
    def __init__(self,
                 word_indexer: Optional[TokenIndexer] = None,
                 is_bert: bool = False,
                 conceptnet_path: Optional[Path] = None):
        super().__init__(lazy=False)
        self.pos_indexers = {"pos_tokens": PosTagIndexer()}
        self.ner_indexers = {"ner_tokens": NerTagIndexer()}
        self.rel_indexers = {
            "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
        }

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
        self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
        bert_splitter = BertBasicWordSplitter()
        self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=False)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
Example #4
def get_bert_token_indexers(
        path_to_bert_weights: str,
        maximum_number_of_tokens: int,
        is_model_lowercase: bool = True) -> PretrainedBertIndexer:
    """
    Retrieving bert based token indexers (which will do sub-word tokenizing)
    
    Parameters
    ----------
    path_to_bert_weights: `str`, required
        The path to the pytorch bert weights

    maximum_number_of_tokens: `int`, required
        The maximum number of tokens (truncation or sliding window based on allennlp design)

    is_model_lowercase: `bool` optional (default=True)
        Force lower casing the input to the model

    Returns
    ----------
    Returns the required instance of :class:`PretrainedBertIndexer`.
    """

    token_indexer = PretrainedBertIndexer(
        pretrained_model=os.path.abspath(
            os.path.join(path_to_bert_weights, 'vocab.txt')),
        max_pieces=maximum_number_of_tokens,
        do_lowercase=is_model_lowercase,
        use_starting_offsets=True)

    return token_indexer
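
A minimal usage sketch for the helper above; the weights directory is a placeholder and is assumed to contain a `vocab.txt` file.

# Hypothetical path; the directory must contain vocab.txt
indexer = get_bert_token_indexers(
    path_to_bert_weights='weights/bert-base-uncased',
    maximum_number_of_tokens=256)
token_indexers = {'bert': indexer}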
Example #5
def get_embedder_info(
    embedder_type: str
) -> Tuple[TokenEmbedder, TokenIndexer, Dict[str, Any]]:
    embedder_type = embedder_type.lower()
    text_field_embedder_kwargs: Dict[str, Any] = {}
    if embedder_type == 'ner_elmo':
        return (NERElmoTokenEmbedder(), ELMoTokenCharactersIndexer(),
                text_field_embedder_kwargs)
    elif embedder_type == 'elmo':
        return (ElmoTokenEmbedder(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE),
                ELMoTokenCharactersIndexer(), text_field_embedder_kwargs)
    elif embedder_type == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=512,  # max pieces allowed for positional embeddings
            do_lowercase=True,
            use_starting_offsets=True,
        )
        text_field_embedder_kwargs['allow_unmatched_keys'] = True
        text_field_embedder_kwargs['embedder_to_indexer_map'] = {
            "tokens": ["tokens", "tokens-offsets"]
        }

        return bert_embedder, token_indexer, text_field_embedder_kwargs
    else:
        raise Exception(f'Unknown embedder type: {embedder_type}')
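
A sketch of how the returned triple might be wired together, assuming `BasicTextFieldEmbedder` is imported from `allennlp.modules.text_field_embedders`.

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# Unpack embedder, indexer, and the extra keyword arguments for the text field embedder
embedder, indexer, embedder_kwargs = get_embedder_info('bert')
word_embeddings = BasicTextFieldEmbedder({'tokens': embedder}, **embedder_kwargs)
token_indexers = {'tokens': indexer}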
Example #6
def get_token_utils(name: str = config.embedder):
    if name == 'elmo':
        from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
        from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

        # the token indexer is responsible for mapping tokens to integers
        token_indexer = ELMoTokenCharactersIndexer()

        def tokenizer(x: str):
            return [
                w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                                  pos_tags=False).split_words(
                                                      x)[:config.max_seq_len]
            ]

        return token_indexer, tokenizer
    elif name == 'bert':
        from allennlp.data.token_indexers import PretrainedBertIndexer

        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=config.max_seq_len,
            do_lowercase=True,
        )

        def tokenizer(s: str):
            return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len -
                                                        2]

        return token_indexer, tokenizer
    raise ValueError(f"Unsupported embedder type: {name}")
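
A short usage sketch, assuming a module-level `config` object with `embedder` and `max_seq_len` set as in the snippet above.

token_indexer, tokenizer = get_token_utils('bert')
pieces = tokenizer("The token indexer maps tokens to integers.")  # wordpiece strings
token_indexers = {"tokens": token_indexer}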
Example #7
def load_data(train_dir, test_dir):
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-uncased",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )

    reader = LoadData(tokenizer=tokenizer,
                      token_indexers={"tokens": token_indexer},
                      col_name=config.col_name)

    train_data = pd.read_csv(train_dir)
    test_data = pd.read_csv(test_dir)

    test_y = test_data[config.col_name[1]].tolist()

    train_data, val_data = train_test_split(train_data,
                                            test_size=0.1,
                                            random_state=42)

    train_data = reader.read(train_data)
    val_data = reader.read(val_data)
    test_data = reader.read(test_data)

    return train_data, val_data, test_data, test_y
Example #8
def get_dataset_reader():
    """
    Get dataset reader instance
    """
    # Fixed configuration
    config = Config(
        use_percentage=1.0, # how much data to use
        max_seq_len=512, # necessary to limit memory usage
        use_extracted_emb=False, # set False since we are using BERT
        batch_size=32 # for testing, this does not matter
    )

    # BERT wordpiece tokenizer
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-multilingual-cased", # recommended by Google
        max_pieces=config.max_seq_len,
        do_lowercase=False,
    )
    def tokenizer(s: str, max_seq_len: int=config.max_seq_len) -> List[str]:
        return [Token(x) for x in token_indexer.wordpiece_tokenizer(s)[:max_seq_len]]

    # dataset reader
    reader = PairDatasetReaderSpecial(
        tokenizer=tokenizer,
        token_indexers={"tokens": token_indexer},
        use_percentage=config.use_percentage,
        use_extracted_emb=config.use_extracted_emb # set False if finetuning
    )
    return config, reader
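
A brief usage sketch for the helper above; the dataset directory is a placeholder.

config, reader = get_dataset_reader()
# Hypothetical path: the reader is expected to yield AllenNLP Instances
train_instances = reader.read("data/pairwise_link_labelling/train/")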
Example #9
def predict(vocab2):
	bert_token_indexer = PretrainedBertIndexer(
	    pretrained_model="bert-large-uncased",
	    max_pieces=config.max_seq_len,
	    do_lowercase=True,
	)
	reader = BertAnalogyDatasetReader(
		tokenizer=bert_tokenizer, 
		token_indexers={'tokens':bert_token_indexer}
	)	

	train_dataset, test_dataset, dev_dataset = (reader.read(DATA_ROOT + "/" + fname) for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

	bert_embedder = PretrainedBertEmbedder(
	         pretrained_model='bert-large-uncased',
	         top_layer_only=True, # conserve memory
	)
	word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": bert_embedder},
	                                                             # we'll be ignoring masks so we'll need to set this to True
	                                                            allow_unmatched_keys = True)

	BERT_DIM = word_embeddings.get_output_dim()
	class BertSentencePooler(Seq2VecEncoder):
	    def forward(self, embs: torch.Tensor,
	                mask: torch.Tensor = None) -> torch.Tensor:
	        # extract first token tensor
	        return embs[:, 0]
	    
	    @overrides
	    def get_output_dim(self) -> int:
	        return BERT_DIM
	        
	# if not vocab2: 
	# 	vocab2 = Vocabulary.from_files("./bert_vocabulary")

	bert_encoder = BertSentencePooler(vocab2)
	model2 = LstmModel(word_embeddings, bert_encoder, vocab2)
	if USE_GPU: model2.cuda()

	with open("./bert_model.th", 'rb') as f:
		model2.load_state_dict(torch.load(f))
	
	predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)
	with open('bert_predictions.txt', 'w+') as f:
		top_10_words_list = []
		for analogy_test in test_dataset:
			logits = predictor2.predict_instance(analogy_test)['logits']
			label_id = np.argmax(logits)
			label_predict = model2.vocab.get_token_from_index(label_id, 'labels')

			top_10_ids = np.argsort(logits)[-10:]
			top_10_words = [model2.vocab.get_token_from_index(idx, 'labels') for idx in top_10_ids]
			top_10_words_list.append(top_10_words)
			f.write(label_predict + "\n")

	top_10_words_list = np.array(top_10_words_list)
	print(top_10_words_list.shape)
	np.save('bert_top_10_words_list.npy', np.array(top_10_words_list))
Example #10
 def __init__(self):
     super().__init__(lazy=False)
     self.token_indexers = {
         "tokens": PretrainedBertIndexer(
             pretrained_model="bert-base-cased",
             do_lowercase=False,
             # max_pieces=config.max_seq_length
         )
     }
Example #11
    def __init__(self, word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        splitter = BertBasicWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        self.word_indexers = {'tokens': word_indexer}
Example #12
def get_indexer(embedding_type: str,
                xlnet_vocab_file: Path
                ) -> TokenIndexer:
    if embedding_type == 'bert':
        return PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=False)
    if embedding_type == 'glove':
        return SingleIdTokenIndexer(lowercase_tokens=True)
    if embedding_type == 'xlnet':
        return XLNetIndexer(vocab_file=str(xlnet_vocab_file))
    raise ValueError(f"Unknown embedding type: {embedding_type}")
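
A usage sketch; the XLNet vocabulary path is a placeholder and is ignored for the other embedding types.

from pathlib import Path

bert_indexer = get_indexer('bert', Path('unused'))
glove_indexer = get_indexer('glove', Path('unused'))
xlnet_indexer = get_indexer('xlnet', Path('models/xlnet-spiece.model'))  # placeholder path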
Example #13
 def __init__(
     self,
     target_namespace: str,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._target_namespace = target_namespace
     self._source_token_indexers: Dict[str, TokenIndexer] = {
         "bert": PretrainedBertIndexer('bert-base-uncased')
     }
     self._target_token_indexers: Dict[str, TokenIndexer] = {
         "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
     }
Example #14
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or JiebaTokenizer(
         word_splitter=JiebaSplitter())
     self._token_indexers = token_indexers or {
         'tokens':
         PretrainedBertIndexer(
             "./bert/multi_cased_L-12_H-768_A-12/bert-base-multilingual-cased-vocab.txt"
         )
     }
Example #15
def load_stackex_data(data_root,
                      assign_id_file,
                      encoder,
                      train_type,
                      max_seq_len,
                      toy_data,
                      use_tokenizer=True,
                      is_rank_task=True):
    # the token indexer is responsible for mapping tokens to integers

    def bert_tokenizer(s: str):
        return token_indexer.wordpiece_tokenizer(s)[:max_seq_len - 2]

    def normal_tokenizer(x: str):
        return [
            w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                              pos_tags=False).split_words(x)
            [:max_seq_len]
        ]

    if encoder == "bert":
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=max_seq_len,
            do_lowercase=True,
        )
        tokenizer = bert_tokenizer
    else:
        token_indexer = SingleIdTokenIndexer()
        tokenizer = normal_tokenizer

    # init dataset reader
    reader = StackExDataReader(assignee_id_file=assign_id_file,
                               tokenizer=tokenizer,
                               token_indexers={"tokens": token_indexer},
                               is_rank_task=is_rank_task,
                               toy_data=toy_data,
                               use_tokenizer=use_tokenizer)

    if train_type == "seq":
        train_pkl, val_pkl, test_pkl = "train.pkl", "val.pkl", "test.pkl"
    else:
        train_pkl, val_pkl, test_pkl = "train_" + train_type + ".pkl", "val_" + train_type + ".pkl", "test_" + train_type + ".pkl"

    # elif train_type == "single":
    #     train_pkl, test_pkl = "train_single.pkl", "test_single.pkl"

    train_ds, val_ds, test_ds = (reader.read(data_root / fname)
                                 for fname in [train_pkl, val_pkl, test_pkl])

    return reader, train_ds, val_ds, test_ds
Example #16
def get_pretrained_bert_indexer(model_name: str = 'bert-base-uncased',
                                cache_dir: str = PATH_ALLENNLP_CACHE,
                                max_length=512,
                                lowercase=True,
                                **kwargs):
    model_path = path.join(cache_dir, 'bert', f'{model_name}-vocab.txt')
    msgex.assert_path_exist(
        path_str=model_path,
        arg_name='model_path',
        extra_msg=f"the specified BERT model '{model_name}' is not found")
    return PretrainedBertIndexer(pretrained_model=model_path,
                                 max_pieces=max_length,
                                 do_lowercase=lowercase,
                                 **kwargs)
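
A sketch of a call to the helper above, assuming `<cache_dir>/bert/bert-base-uncased-vocab.txt` already exists under the configured cache directory.

# Hypothetical call: fails the path assertion if the vocabulary file is missing
indexer = get_pretrained_bert_indexer('bert-base-uncased', max_length=256, lowercase=True)
token_indexers = {'bert': indexer}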
Example #17
 def get_token_indexer(self, token_indexers):
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     # the token indexer is responsible for mapping tokens to integers
     if self.embeddings == 'elmo':
         self.token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
     elif self.embeddings == 'bert':
         self.bert_indexer = PretrainedBertIndexer(
             pretrained_model="bert-base-uncased",
             max_pieces=128,
             do_lowercase=True,
         )
         self.token_indexers = {"bert": self.bert_indexer}
Example #18
    def test_reader_from_file(self):
        reader = BaselineReader(
            idiom_vector_path=self.IDIOM_VECTOR_PATH,
            pretrained_model=self.PRETRAINED_MODEL,
            content_token_indexer={
                "bert": PretrainedBertIndexer(
                    self.PRETRAINED_MODEL
                )
            },
            max_seq_length=256
        )
        instances = reader.read(str(self.FIXTURES_ROOT / "data" / "realcount_3_sample.txt"))
        instances = ensure_list(instances)

        assert len(instances) == 3
Example #19
    def __init__(self, lazy: bool = False,
                 load_cache: bool = True,
                 save_cache: bool = True,
                 loading_ratio: float = 1.0,
                 enable_unparse: bool = True,
                 use_bert: bool = False,
                 # insert before/after/both
                 super_mode: str = 'before',
                 word_level: bool = False,
                 # if joint encoding, means that
                 joint_encoding: bool = False,
                 language: str = 'zh',
                 extra_stop_words: List = None):
        super().__init__(lazy=lazy)

        # different languages share the same word splitter
        self._tokenizer = WordTokenizer(JustSpacesWordSplitter())

        if use_bert:
            pretrain_model_name = 'bert-base-chinese' if language == 'zh' else 'bert-base-uncased'
            self._indexer = {'bert': PretrainedBertIndexer(pretrained_model=pretrain_model_name,
                                                           use_starting_offsets=False,
                                                           do_lowercase=True,
                                                           never_lowercase=['[UNK]', '[PAD]', '[CLS]', '[SEP]'],
                                                           truncate_long_sequences=False)}
        else:
            self._indexer = {'tokens': SingleIdTokenIndexer(namespace='tokens')}
        # loading ratio is designed for hyper-parameter fine-tuning
        self._loading_ratio = loading_ratio
        self._load_cache = load_cache
        self._save_cache = save_cache

        self._use_bert = use_bert
        self._language = language
        self._word_level = word_level
        self._super_mode = super_mode
        self._enable_unparse = enable_unparse
        self._joint_encoding = joint_encoding
        self._extra_stop_words = extra_stop_words
Example #20
def set_values(max_sequence_length: Optional[int] = -1,
               concat_title_abstract: Optional[bool] = None,
               data_source: Optional[str] = None,
               included_text_fields: Optional[str] = None) -> None:
    # set global values
    # note: a class with __init__ would have been a cleaner design; we use module-level
    # globals here for efficiency, since multiprocessing with class methods is slower
    global _tokenizer
    global _token_indexers
    global _token_indexer_author_id
    global _token_indexer_author_position
    global _token_indexer_venue
    global _token_indexer_id
    global _max_sequence_length
    global _concat_title_abstract
    global _data_source
    global _included_text_fields

    if _tokenizer is None:  # if not initialized, initialize the tokenizers and token indexers
        _tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter(
            do_lower_case=bert_params["do_lowercase"]))
        _token_indexers = {
            "bert": PretrainedBertIndexer.from_params(Params(bert_params))
        }
        _token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        _token_indexer_author_position = {
            "tokens": SingleIdTokenIndexer(namespace='author_positions')
        }
        _token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        _token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}
    _max_sequence_length = max_sequence_length
    _concat_title_abstract = concat_title_abstract
    _data_source = data_source
    _included_text_fields = included_text_fields
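
A sketch of how a worker process might call this initializer before reading data; the argument values are illustrative only, and `bert_params` is assumed to be defined at module level as in the snippet above.

# Illustrative values only; `included_text_fields` follows whatever convention the caller uses
set_values(max_sequence_length=256,
           concat_title_abstract=True,
           data_source='train',
           included_text_fields='title abstract')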
Example #21
    def __init__(self,
                 is_bert: bool,
                 conceptnet_path: Path,
                 word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=True)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        # self.rel_indexers = {
        #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}
        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
Example #22
        assert(len(sent_embeddings) == len(rel_labels))
       
        return sentences, sent_embeddings, rel_labels


"""
Example
"""
if __name__ == "__main__": 
    working_dir = "../data/ICNALE/original/pairwise_link_labelling/test/"
    
    # BERT wordpiece tokenizer
    max_seq_len = 512 # maximum number of tokens when tokenizing a text
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-multilingual-cased", # recommended by Google
        max_pieces=max_seq_len,
        do_lowercase=False,
    )

    def tokenizer(s: str, max_seq_len: int=512) -> List[str]:
        return [Token(x) for x in token_indexer.wordpiece_tokenizer(s)[:max_seq_len]]

    # reading dataset
    reader = PairDatasetReader(
        tokenizer=None, # None if you do not want to tokenize
        token_indexers={"tokens": token_indexer},
        use_percentage=0.1,
        use_extracted_emb=False # set False if finetuning
    )

    train_ds = reader.read(working_dir)
Example #23
                TextField(tk, self._token_indexers)
                for tk in aug_unsup_tokenized_sents
            ])

            fields['ori_unsup_sentences'] = ori_unsup_sentence_sequence
            fields['aug_unsup_sentences'] = aug_unsup_sentence_sequence
        # Fake data
        # fields['ori_unsup_sentences'] = sentence_sequence
        # fields['aug_unsup_sentences'] = sentence_sequence

        return Instance(fields)


# %%
token_indexer = PretrainedBertIndexer(
    pretrained_model="./biobert_v1.1_pubmed/vocab.txt",
    do_lowercase=True,
)

# %%
reader = ClaimAnnotationReaderJSON(token_indexers={"tokens": token_indexer},
                                   lazy=True)
merge_reader = MergeDatasetReader(token_indexers={"tokens": token_indexer},
                                  lazy=True)

# train_dataset = reader.read(TRAIN_PATH)
train_dataset = merge_reader.read(COMBINED_TRAIN_PATH)
validation_dataset = reader.read(VALIDATION_PATH)
test_dataset = reader.read(TEST_PATH)
# %%
vocab = Vocabulary()
Example #24
        fileds["label"] = label_field
        return Instance(fileds)


# def tokenizer(x: str):
#     return [w.text for w in SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)[: config.max_seq_len]]
# special tokenizer for bert
def tokenizer(s: str):
    return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2]


# read data
# token_indexer = SingleIdTokenIndexer()
token_indexer = PretrainedBertIndexer(
    pretrained_model="bert-base-uncased",
    max_pieces=config.max_seq_len,
    do_lowercase=True,
)
reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer})
DATA_ROOT = Path("data") / "jigsaw"
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
val_ds = None

# prepare vocab
# vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)
vocab = Vocabulary()

# prepare iterator
iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)
Example #25
model: torch.nn.Module = SentencePairModel(
    extractor=CoupledSentencePairFeatureExtractor(
        joiner=BERTConcatenator(),
        encoder=BertSeq2VecEncoderForPairs.from_pretrained(
            "bert-base-uncased")),
    mlp=torch.nn.Sequential(torch.nn.Linear(768, 1), torch.nn.Sigmoid()),
    loss_func=torch.nn.BCELoss(),
    mode="regression")
model.cuda()

if ARGS.pretrained != "":
    model.load_state_dict(torch.load(ARGS.pretrained))

reader = QRelsPointwiseReader(
    lazy=True,
    token_indexers={"wordpiece": PretrainedBertIndexer("bert-base-uncased")},
    left_tokenizer=BertTokenizer(),
    right_tokenizer=BertTokenizer())
iterator = BasicIterator()
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=Adam(params=model.parameters(), lr=0.00001),
                  grad_norm=1.0,
                  train_dataset=reader.read(f"{ARGS.data}/train"),
                  validation_dataset=reader.read(f"{ARGS.data}/dev"),
                  iterator=iterator,
                  validation_metric="+pearson",
                  num_epochs=3,
                  patience=3,
                  serialization_dir=ARGS.out)
Example #26
# In[6]:

from pytorch_pretrained_bert import BertConfig, BertForMaskedLM
masked_lm = BertForMaskedLM.from_pretrained(config.model_type)
masked_lm.eval()

# In[ ]:

# In[7]:

from allennlp.data import Token
from allennlp.data.token_indexers import PretrainedBertIndexer

token_indexer = PretrainedBertIndexer(
    pretrained_model=config.model_type,
    max_pieces=config.max_seq_len,
    do_lowercase=True,
)
#     if len(toks) < config.max_seq_len:
#         return toks + (["[PAD]"] * (maxlen - len(toks)))
#     else:


def tokenizer(s: str):
    maxlen = config.max_seq_len - 2
    toks = token_indexer.wordpiece_tokenizer(s)[:maxlen]
    return toks


# In[8]:
Example #27
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    reader = EIDatasetReader(bert_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('intervention', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #28
    HIDDEN_DIM = 256
    PRINT_EVERY = 1000
    EVALUATE_EVERY_EPOCH = 1
    ENCODER_LAYER = 2
    DROPOUT_RATE = 0.1
    BATCH_SIZE = 32
    INIT_LEARNING_RATE = 0.0
    EPOCH = 1000
    WARMUP_STEPS = 5000
    PATIENCE = 10
    BERT = False

    torch.manual_seed(1)

    if BERT:
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased", do_lowercase=True)
        reader = DisfluencyDatasetReader(
            token_indexers={"tokens": token_indexer})
    else:
        reader = DisfluencyDatasetReader()

    train_dataset = reader.read('../train.txt')
    validation_dataset = reader.read('../val.txt')
    test_dataset = reader.read('../test.txt')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset +
                                      test_dataset)

    if BERT:
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
Example #29
def run_model(name, context, conf, double_input, use_elmo=False, save_predictions=False, save_model=False):
    """
    Runs the given model 'name' for the given 'context' and agreement level 'conf'. If double_input is True, runs the combined model using context comment text. Optionally saves the trained model & its vocabulary, and predictions.
    Allowed names: lstm | bilstm | stacked_bilstm | cnn | dense_lstm | dense_bilstm | dense_stacked_bilstm | dense_cnn | nli_cnn | bert | dense_bert
    
    If use_elmo=True, uses ELMo's pre-trained language model for embeddings.

    """
    if use_elmo:
        token_indexer = ELMoTokenCharactersIndexer() # token indexer is responsible for mapping tokens to integers: this makes sure that the mapping is consistent with what was used in the original ELMo training.
    elif name == 'bert':
        global bert_token_indexer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL, do_lowercase=True)
    else:
        token_indexer = SingleIdTokenIndexer()

    if name == 'bert': # BERT uses a special wordpiece tokenizer
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input,
                                                      tokenizer=tokenizer_bert, token_indexers={"tokens": bert_token_indexer},
                                                      label_cols=LABEL_COLS)
    else:
        reader = data_reader.UnpalatableDatasetReader(main_input=context, additional_context=double_input, tokenizer=tokenizer,
                                                      token_indexers={"tokens": token_indexer}, label_cols=LABEL_COLS)


    map_reply_id_pred_probability = {}; n_epochs = []
    f1s, AUROCs, weighted_f1s, precision_s, recall_s, accuracies, AUPRCs = [], [], [], [], [], [], []

    for fold_number in range(1,6): # 5-fold cross validation
        train_fname = 'train_data_fold_'+str(fold_number)+'_OneHot.csv'
        val_fname = 'val_data_fold_'+str(fold_number)+'_OneHot.csv'
        test_fname = 'test_data_fold_'+str(fold_number)+'_OneHot.csv'

        train_dataset = reader.read(file_path=DATA_ROOT / conf / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / conf / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / conf / test_fname)
        print("\n#####################################################\n", double_input, context, conf, name, len(train_dataset), len(validation_dataset), len(test_dataset))

        # Train model:
        if name == 'lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=1, bidirectional=False, use_elmo=use_elmo, double_input=double_input)
        elif name == 'dense_lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=1, bidirectional=False, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=1, bidirectional=True, use_elmo=use_elmo, double_input=double_input)
        elif name == 'dense_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=1, bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                                num_layers=2, bidirectional=True, use_elmo=use_elmo, double_input=double_input)
        elif name == 'dense_stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                                col_name=context, num_layers=2, bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'cnn':
            if context == 'reply_text': filter_sizes = (2,3) # kernels can not be bigger than the shortest sentence
            else: filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=False,
                                               num_filters=100, filter_sizes=filter_sizes, use_elmo=use_elmo,
                                               double_input=double_input)
        elif name == 'dense_cnn':
            if context == 'reply_text': filter_sizes = (2,3) # kernels can not be bigger than the shortest sentence
            else: filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE, dense_vector=True,
                                               col_name=context, num_filters=100, filter_sizes=filter_sizes, use_elmo=use_elmo,
                                               double_input=double_input)      
        elif name == 'nli_cnn':
            if double_input == False:
                print("Error: NLI-inspired architecture only accepts double-input.")
                return [None]*9
            filter_sizes = (2,3)
            model, vocab, ep = train.train_nli(train_dataset, validation_dataset, BATCH_SIZE, use_elmo=use_elmo,
                                               num_filters=100, filter_sizes=filter_sizes)
        elif name == 'bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE, pretrained_model=BERT_MODEL,
                                                dense_vector=False, double_input=double_input)
        elif name == 'dense_bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE, pretrained_model=BERT_MODEL, 
                                                dense_vector=True, col_name=context, double_input=double_input)
        else:
            sys.exit("'name' not valid")
            
        n_epochs.append(ep) # keep track of number of actual training epochs for each fold
        
        # Predict and evaluate model on test set:
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE, use_gpu=False) # NOTE: preds is of shape (number of samples, 2) - the columns represent the probabilities for the two classes in order ['yes_unp', 'not_unp']
        f1, auroc, w_f1, precision, recall, acc, auprc = evaluate.compute_metrics(preds, test_dataset)
        
        if save_predictions: # save predictions for error analysis
            replyid_pred = evaluate.map_id_prediction(preds, test_dataset)
            if set(replyid_pred.keys()).intersection(set(map_reply_id_pred_probability.keys())) != set(): # sanity check
                sys.exit("Error: There is overlap in Test IDs across folds.")
            map_reply_id_pred_probability.update(replyid_pred)
        
        if save_model: # save the model weights and vocabulary
            with open('./tmp/'+name+'_model_conf_'+conf.split('-')[1]+'_fold_'+str(fold_number)+'.th', 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/"+name+"_vocabulary_"+conf.split('-')[1]+"_fold_"+str(fold_number))

        print("\nFold #{} | F1 = {} | AUROC = {} | AUPRC = {}".format(fold_number, f1, auroc, auprc))

        f1s.append(f1); AUROCs.append(auroc); weighted_f1s.append(w_f1); precision_s.append(precision); 
        recall_s.append(recall); accuracies.append(acc); AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean(); mean_auroc = np.array(AUROCs).mean(); mean_weighted_f1 = np.array(weighted_f1s).mean(); 
    mean_precision = np.array(precision_s).mean(); mean_recall = np.array(recall_s).mean(); mean_accuracy = np.array(accuracies).mean(); mean_auprc = np.array(AUPRCs).mean()

    print("Total predictions: {} | Save Predictions: {}".format(len(map_reply_id_pred_probability), save_predictions))
    
    return mean_f1, mean_auroc, mean_weighted_f1, mean_precision, mean_recall, mean_accuracy, mean_auprc, map_reply_id_pred_probability, n_epochs
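
A hedged sketch of a single invocation of the routine above; the `context` and `conf` values are placeholders for whatever column names and agreement levels the data uses.

# Hypothetical call: 5-fold CV with the BERT model on one input column
(mean_f1, mean_auroc, mean_weighted_f1, mean_precision, mean_recall,
 mean_accuracy, mean_auprc, id_to_prob, n_epochs) = run_model(
    name='bert', context='reply_text', conf='conf-0.6',
    double_input=False, use_elmo=False,
    save_predictions=False, save_model=False)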
Example #30
def multiprocess_single_sequence_loader(process_number: int, _config,
                                        _queue: mp.Queue,
                                        _wait_for_exit: mp.Event, _local_file,
                                        _fasttext_vocab_cached_mapping,
                                        _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(
            pretrained_model=_config["bert_pretrained_model"],
            do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))

    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {
                "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
            }
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {
                "tokens":
                FastTextNGramIndexer(_config["fasttext_max_subwords"])
            }
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])

        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file),
                                    num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until the shared memory is used and no longer needed
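
A sketch of how this loader might be launched, assuming a populated configuration dictionary; the file path and values are placeholders, and the FastText caches are only needed for the `fasttext` embedder.

import torch.multiprocessing as mp

# Hypothetical launch of one loader process for a BERT-CLS configuration
run_config = {
    "random_seed": 13,
    "token_embedder_type": "bert_cls",
    "bert_pretrained_model": "bert-base-uncased",
    "max_doc_length": 200,
    "min_doc_length": 1,
    "batch_size_eval": 32,
    "vocab_directory": "experiments/vocab",  # placeholder path
}

queue = mp.Queue(maxsize=8)
exit_event = mp.Event()
worker = mp.Process(target=multiprocess_single_sequence_loader,
                    args=(0, run_config, queue, exit_event,
                          "data/validation.tsv", None, None))
worker.start()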