def get_embedder_info(
        embedder_type: str
) -> Tuple[TokenEmbedder, TokenIndexer, Dict[str, Any]]:
    embedder_type = embedder_type.lower()
    text_field_embedder_kwargs: Dict[str, Any] = {}
    if embedder_type == 'ner_elmo':
        return NERElmoTokenEmbedder(), ELMoTokenCharactersIndexer(), text_field_embedder_kwargs
    elif embedder_type == 'elmo':
        return (ElmoTokenEmbedder(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE),
                ELMoTokenCharactersIndexer(),
                text_field_embedder_kwargs)
    elif embedder_type == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,  # conserve memory
        )
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=512,  # maximum pieces allowed by BERT's positional embeddings
            do_lowercase=True,
            use_starting_offsets=True,
        )
        text_field_embedder_kwargs['allow_unmatched_keys'] = True
        text_field_embedder_kwargs['embedder_to_indexer_map'] = {
            "tokens": ["tokens", "tokens-offsets"]
        }
        return bert_embedder, token_indexer, text_field_embedder_kwargs
    else:
        raise ValueError(f'Unknown embedder type: {embedder_type}')
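# Hedged usage sketch (not part of the original module): assuming get_embedder_info and its
# AllenNLP 0.9-era dependencies are importable, the returned triple is typically wired into a
# BasicTextFieldEmbedder as below; the helper name is hypothetical.
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder


def build_text_field_embedder(embedder_type: str = 'bert'):
    token_embedder, token_indexer, text_field_embedder_kwargs = get_embedder_info(embedder_type)
    # for 'bert', the kwargs carry allow_unmatched_keys and embedder_to_indexer_map
    text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedder},
                                                 **text_field_embedder_kwargs)
    token_indexers = {"tokens": token_indexer}  # hand these to the dataset reader
    return text_field_embedder, token_indexers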
def get_token_utils(name: str = config.embedder):
    if name == 'elmo':
        from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
        from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

        # the token indexer is responsible for mapping tokens to integers
        token_indexer = ELMoTokenCharactersIndexer()

        def tokenizer(x: str):
            return [w.text for w in
                    SpacyWordSplitter(language='en_core_web_sm',
                                      pos_tags=False).split_words(x)[:config.max_seq_len]]

        return token_indexer, tokenizer

    elif name == 'bert':
        from allennlp.data.token_indexers import PretrainedBertIndexer

        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=config.max_seq_len,
            do_lowercase=True,
        )

        def tokenizer(s: str):
            return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2]

        return token_indexer, tokenizer
def load_data(train_dir, test_dir):
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-uncased",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )
    # `tokenizer` is expected to be defined at module level (e.g. a wordpiece tokenizer)
    reader = LoadData(tokenizer=tokenizer,
                      token_indexers={"tokens": token_indexer},
                      col_name=config.col_name)

    train_data = pd.read_csv(train_dir)
    test_data = pd.read_csv(test_dir)
    test_y = test_data[config.col_name[1]].tolist()

    train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
    train_data = reader.read(train_data)
    val_data = reader.read(val_data)
    test_data = reader.read(test_data)
    return train_data, val_data, test_data, test_y
def get_bert_token_indexers(path_to_bert_weights: str,
                            maximum_number_of_tokens: int,
                            is_model_lowercase: bool = True) -> PretrainedBertIndexer:
    """
    Retrieve a BERT-based token indexer (which performs sub-word tokenising).

    Parameters
    ----------
    path_to_bert_weights : ``str``, required
        The path to the PyTorch BERT weights.
    maximum_number_of_tokens : ``int``, required
        The maximum number of tokens (truncation or sliding window, following the AllenNLP design).
    is_model_lowercase : ``bool``, optional (default=True)
        Force lower-casing of the input to the model.

    Returns
    -------
    The required instance of :class:`PretrainedBertIndexer`.
    """
    token_indexer = PretrainedBertIndexer(
        pretrained_model=os.path.abspath(os.path.join(path_to_bert_weights, 'vocab.txt')),
        max_pieces=maximum_number_of_tokens,
        do_lowercase=is_model_lowercase,
        use_starting_offsets=True)
    return token_indexer
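# Hedged usage sketch: assuming a local checkpoint directory containing a 'vocab.txt'
# (the path below is a placeholder), the helper above can feed a dataset reader directly.
bert_indexers = {
    "bert": get_bert_token_indexers(path_to_bert_weights="/path/to/bert-base-uncased",
                                    maximum_number_of_tokens=512)
}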
def get_dataset_reader():
    """Get a dataset reader instance."""
    # Fixed configuration
    config = Config(
        use_percentage=1.0,       # how much of the data to use
        max_seq_len=512,          # necessary to limit memory usage
        use_extracted_emb=False,  # set to False since we are using BERT
        batch_size=32             # for testing, this does not matter
    )

    # BERT wordpiece tokenizer
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-multilingual-cased",  # recommended by Google
        max_pieces=config.max_seq_len,
        do_lowercase=False,
    )

    def tokenizer(s: str, max_seq_len: int = config.max_seq_len) -> List[Token]:
        return [Token(x) for x in token_indexer.wordpiece_tokenizer(s)[:max_seq_len]]

    # dataset reader
    reader = PairDatasetReaderSpecial(
        tokenizer=tokenizer,
        token_indexers={"tokens": token_indexer},
        use_percentage=config.use_percentage,
        use_extracted_emb=config.use_extracted_emb  # set to False if fine-tuning
    )
    return config, reader
def __init__(self,
             word_indexer: Optional[TokenIndexer] = None,
             is_bert: bool = False,
             conceptnet_path: Optional[Path] = None):
    super().__init__(lazy=False)
    self.pos_indexers = {"pos_tokens": PosTagIndexer()}
    self.ner_indexers = {"ner_tokens": NerTagIndexer()}
    self.rel_indexers = {
        "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
    }

    if is_bert:
        splitter = BertBasicWordSplitter()
    else:
        splitter = SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
    self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
    bert_splitter = BertBasicWordSplitter()
    self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

    if word_indexer is None:
        if is_bert:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        else:
            word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
def __init__(self,
             lazy: bool = False,
             question_token_indexers: Dict[str, TokenIndexer] = None,
             keep_if_unparsable: bool = True,
             tables_file: str = None,
             dataset_path: str = 'dataset/database',
             load_cache: bool = True,
             save_cache: bool = True,
             loading_limit=-1):
    super().__init__(lazy=lazy)
    # The default spaCy tokenizer splits the common token 'id' into ['i', 'd'];
    # as a manual fix we tokenize with BERT's wordpiece tokenizer instead.
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-uncased",
        do_lowercase=True,
        use_starting_offsets=True)
    self._tokenizer = Bert_Tokenizer(token_indexer.wordpiece_tokenizer)
    self._utterance_token_indexers = question_token_indexers or {
        'tokens': token_indexer
    }
    self._keep_if_unparsable = keep_if_unparsable
    self._tables_file = tables_file
    self._dataset_path = dataset_path
    self._load_cache = load_cache
    self._save_cache = save_cache
    self._loading_limit = loading_limit
def predict(vocab2):
    bert_token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-large-uncased",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )
    reader = BertAnalogyDatasetReader(
        tokenizer=bert_tokenizer,
        token_indexers={'tokens': bert_token_indexer}
    )
    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"]
    )

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model='bert-large-uncased',
        top_layer_only=True,  # conserve memory
    )
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder},
        # we'll be ignoring masks, so we need to set this to True
        allow_unmatched_keys=True)
    BERT_DIM = word_embeddings.get_output_dim()

    class BertSentencePooler(Seq2VecEncoder):
        def forward(self, embs: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
            # extract the first ([CLS]) token embedding
            return embs[:, 0]

        @overrides
        def get_output_dim(self) -> int:
            return BERT_DIM

    # if not vocab2:
    #     vocab2 = Vocabulary.from_files("./bert_vocabulary")
    bert_encoder = BertSentencePooler(vocab2)
    model2 = LstmModel(word_embeddings, bert_encoder, vocab2)
    if USE_GPU:
        model2.cuda()

    with open("./bert_model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))

    predictor2 = SentenceClassifierPredictor(model2, dataset_reader=reader)
    with open('bert_predictions.txt', 'w+') as f:
        top_10_words_list = []
        for analogy_test in test_dataset:
            logits = predictor2.predict_instance(analogy_test)['logits']
            label_id = np.argmax(logits)
            label_predict = model2.vocab.get_token_from_index(label_id, 'labels')
            top_10_ids = np.argsort(logits)[-10:]
            top_10_words = [model2.vocab.get_token_from_index(idx, 'labels')
                            for idx in top_10_ids]
            top_10_words_list.append(top_10_words)
            f.write(label_predict + "\n")

    top_10_words_list = np.array(top_10_words_list)
    print(top_10_words_list.shape)
    np.save('bert_top_10_words_list.npy', top_10_words_list)
def __init__(self):
    super().__init__(lazy=False)
    self.token_indexers = {
        "tokens": PretrainedBertIndexer(
            pretrained_model="bert-base-cased",
            do_lowercase=False,
            # max_pieces=config.max_seq_length
        )
    }
def __init__(self, model_type: str, max_seq_len: int = 128):
    self.model_type = model_type
    self.max_seq_len = max_seq_len
    self.token_indexer = PretrainedBertIndexer(
        pretrained_model=self.model_type,
        max_pieces=self.max_seq_len,
        do_lowercase=True,
    )
    self.vocab = Vocabulary()
    self.token_indexer._add_encoding_to_vocabulary(self.vocab)
    self.full_vocab = {v: k for k, v in self.token_indexer.vocab.items()}
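# Hedged usage sketch (the class name below is hypothetical; only the attributes defined
# above are assumed): full_vocab maps wordpiece ids back to token strings, so indexed
# output can be decoded for inspection.
# helper = BertVocabHelper("bert-base-uncased", max_seq_len=128)
# pieces = helper.token_indexer.wordpiece_tokenizer("allennlp makes indexing easy")
# ids = [helper.token_indexer.vocab[p] for p in pieces]
# print([helper.full_vocab[i] for i in ids])  # round-trips back to the wordpieces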
def __init__(self, word_indexer: Optional[TokenIndexer] = None):
    super().__init__(lazy=False)
    splitter = BertBasicWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)
    if word_indexer is None:
        word_indexer = PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=False)
    self.word_indexers = {'tokens': word_indexer}
def get_indexer(embedding_type: str, xlnet_vocab_file: Path) -> TokenIndexer:
    if embedding_type == 'bert':
        return PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=False)
    if embedding_type == 'glove':
        return SingleIdTokenIndexer(lowercase_tokens=True)
    if embedding_type == 'xlnet':
        return XLNetIndexer(vocab_file=str(xlnet_vocab_file))
    raise ValueError(f'Unknown embedding type: {embedding_type}')
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or JiebaTokenizer(word_splitter=JiebaSplitter())
    self._token_indexers = token_indexers or {
        'tokens': PretrainedBertIndexer(
            "./bert/multi_cased_L-12_H-768_A-12/bert-base-multilingual-cased-vocab.txt"
        )
    }
def __init__(
        self,
        target_namespace: str,
        lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._target_namespace = target_namespace
    self._source_token_indexers: Dict[str, TokenIndexer] = {
        "bert": PretrainedBertIndexer('bert-base-uncased')
    }
    self._target_token_indexers: Dict[str, TokenIndexer] = {
        "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
    }
def load_stackex_data(data_root, assign_id_file, encoder, train_type, max_seq_len,
                      toy_data, use_tokenizer=True, is_rank_task=True):
    # the token indexer is responsible for mapping tokens to integers
    def bert_tokenizer(s: str):
        return token_indexer.wordpiece_tokenizer(s)[:max_seq_len - 2]

    def normal_tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm',
                                  pos_tags=False).split_words(x)[:max_seq_len]]

    if encoder == "bert":
        token_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=max_seq_len,
            do_lowercase=True,
        )
        tokenizer = bert_tokenizer
    else:
        token_indexer = SingleIdTokenIndexer()
        tokenizer = normal_tokenizer

    # init dataset reader
    reader = StackExDataReader(assignee_id_file=assign_id_file,
                               tokenizer=tokenizer,
                               token_indexers={"tokens": token_indexer},
                               is_rank_task=is_rank_task,
                               toy_data=toy_data,
                               use_tokenizer=use_tokenizer)

    if train_type == "seq":
        train_pkl, val_pkl, test_pkl = "train.pkl", "val.pkl", "test.pkl"
    else:
        train_pkl = "train_" + train_type + ".pkl"
        val_pkl = "val_" + train_type + ".pkl"
        test_pkl = "test_" + train_type + ".pkl"
    # elif train_type == "single":
    #     train_pkl, test_pkl = "train_single.pkl", "test_single.pkl"

    train_ds, val_ds, test_ds = (reader.read(data_root / fname)
                                 for fname in [train_pkl, val_pkl, test_pkl])
    return reader, train_ds, val_ds, test_ds
def get_pretrained_bert_indexer(model_name: str = 'bert-base-uncased',
                                cache_dir: str = PATH_ALLENNLP_CACHE,
                                max_length=512,
                                lowercase=True,
                                **kwargs):
    model_path = path.join(cache_dir, 'bert', f'{model_name}-vocab.txt')
    msgex.assert_path_exist(
        path_str=model_path,
        arg_name='model_path',
        extra_msg=f"the specified BERT model '{model_name}' is not found")
    return PretrainedBertIndexer(pretrained_model=model_path,
                                 max_pieces=max_length,
                                 do_lowercase=lowercase,
                                 **kwargs)
def get_token_indexer(self, token_indexers):
    self.token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    # the token indexer is responsible for mapping tokens to integers
    if self.embeddings == 'elmo':
        self.token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    elif self.embeddings == 'bert':
        self.bert_indexer = PretrainedBertIndexer(
            pretrained_model="bert-base-uncased",
            max_pieces=128,
            do_lowercase=True,
        )
        self.token_indexers = {"bert": self.bert_indexer}
def test_reader_from_file(self):
    reader = BaselineReader(
        idiom_vector_path=self.IDIOM_VECTOR_PATH,
        pretrained_model=self.PRETRAINED_MODEL,
        content_token_indexer={
            "bert": PretrainedBertIndexer(self.PRETRAINED_MODEL)
        },
        max_seq_length=256
    )
    instances = reader.read(str(self.FIXTURES_ROOT / "data" / "realcount_3_sample.txt"))
    instances = ensure_list(instances)
    assert len(instances) == 3
def __init__(self,
             lazy: bool = False,
             load_cache: bool = True,
             save_cache: bool = True,
             loading_ratio: float = 1.0,
             enable_unparse: bool = True,
             use_bert: bool = False,
             # insert before/after/both
             super_mode: str = 'before',
             word_level: bool = False,
             # whether to use joint encoding
             joint_encoding: bool = False,
             language: str = 'zh',
             extra_stop_words: List = None):
    super().__init__(lazy=lazy)
    # different languages share the same word splitter
    self._tokenizer = WordTokenizer(JustSpacesWordSplitter())
    if use_bert:
        pretrain_model_name = 'bert-base-chinese' if language == 'zh' else 'bert-base-uncased'
        self._indexer = {'bert': PretrainedBertIndexer(
            pretrained_model=pretrain_model_name,
            use_starting_offsets=False,
            do_lowercase=True,
            never_lowercase=['[UNK]', '[PAD]', '[CLS]', '[SEP]'],
            truncate_long_sequences=False)}
    else:
        self._indexer = {'tokens': SingleIdTokenIndexer(namespace='tokens')}

    # the loading ratio is designed for hyper-parameter fine-tuning
    self._loading_ratio = loading_ratio

    self._load_cache = load_cache
    self._save_cache = save_cache
    self._use_bert = use_bert
    self._language = language
    self._word_level = word_level
    self._super_mode = super_mode
    self._enable_unparse = enable_unparse
    self._joint_encoding = joint_encoding
    self._extra_stop_words = extra_stop_words
def __init__(self,
             is_bert: bool,
             conceptnet_path: Path,
             word_indexer: Optional[TokenIndexer] = None):
    super().__init__(lazy=False)
    if is_bert:
        splitter = BertBasicWordSplitter()
    else:
        splitter = SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    if word_indexer is None:
        if is_bert:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=True)
        else:
            word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    # self.rel_indexers = {
    #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}

    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
    assert len(sent_embeddings) == len(rel_labels)
    return sentences, sent_embeddings, rel_labels


"""
Example
"""
if __name__ == "__main__":
    working_dir = "../data/ICNALE/original/pairwise_link_labelling/test/"

    # BERT wordpiece tokenizer
    max_seq_len = 512  # maximum number of tokens when tokenizing a text
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-multilingual-cased",  # recommended by Google
        max_pieces=max_seq_len,
        do_lowercase=False,
    )

    def tokenizer(s: str, max_seq_len: int = 512) -> List[Token]:
        return [Token(x) for x in token_indexer.wordpiece_tokenizer(s)[:max_seq_len]]

    # reading the dataset
    reader = PairDatasetReader(
        tokenizer=None,  # None if you do not want to tokenize
        token_indexers={"tokens": token_indexer},
        use_percentage=0.1,
        use_extracted_emb=False  # set to False if fine-tuning
    )
    train_ds = reader.read(working_dir)
        TextField(tk, self._token_indexers)
        for tk in aug_unsup_tokenized_sents
    ])
    fields['ori_unsup_sentences'] = ori_unsup_sentence_sequence
    fields['aug_unsup_sentences'] = aug_unsup_sentence_sequence
    # Fake data
    # fields['ori_unsup_sentences'] = sentence_sequence
    # fields['aug_unsup_sentences'] = sentence_sequence
    return Instance(fields)


# %%
token_indexer = PretrainedBertIndexer(
    pretrained_model="./biobert_v1.1_pubmed/vocab.txt",
    do_lowercase=True,
)

# %%
reader = ClaimAnnotationReaderJSON(token_indexers={"tokens": token_indexer},
                                   lazy=True)
merge_reader = MergeDatasetReader(token_indexers={"tokens": token_indexer},
                                  lazy=True)
# train_dataset = reader.read(TRAIN_PATH)
train_dataset = merge_reader.read(COMBINED_TRAIN_PATH)
validation_dataset = reader.read(VALIDATION_PATH)
test_dataset = reader.read(TEST_PATH)

# %%
vocab = Vocabulary()
fileds["label"] = label_field return Instance(fileds) # def tokenizer(x: str): # return [w.text for w in SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)[: config.max_seq_len]] # special tokenizer for bert def tokenizer(s: str): return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2] # read data # token_indexer = SingleIdTokenIndexer() token_indexer = PretrainedBertIndexer( pretrained_model="bert-base-uncased", max_pieces=config.max_seq_len, do_lowercase=True, ) reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer}) DATA_ROOT = Path("data") / "jigsaw" train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"]) val_ds = None # prepare vocab # vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size) vocab = Vocabulary() # prepare iterator iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab)
model: torch.nn.Module = SentencePairModel(
    extractor=CoupledSentencePairFeatureExtractor(
        joiner=BERTConcatenator(),
        encoder=BertSeq2VecEncoderForPairs.from_pretrained("bert-base-uncased")),
    mlp=torch.nn.Sequential(torch.nn.Linear(768, 1), torch.nn.Sigmoid()),
    loss_func=torch.nn.BCELoss(),
    mode="regression")
model.cuda()

if ARGS.pretrained != "":
    model.load_state_dict(torch.load(ARGS.pretrained))

reader = QRelsPointwiseReader(
    lazy=True,
    token_indexers={"wordpiece": PretrainedBertIndexer("bert-base-uncased")},
    left_tokenizer=BertTokenizer(),
    right_tokenizer=BertTokenizer())

iterator = BasicIterator()
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=Adam(params=model.parameters(), lr=0.00001),
                  grad_norm=1.0,
                  train_dataset=reader.read(f"{ARGS.data}/train"),
                  validation_dataset=reader.read(f"{ARGS.data}/dev"),
                  iterator=iterator,
                  validation_metric="+pearson",
                  num_epochs=3,
                  patience=3,
                  serialization_dir=ARGS.out,
# In[6]:

from pytorch_pretrained_bert import BertConfig, BertForMaskedLM

masked_lm = BertForMaskedLM.from_pretrained(config.model_type)
masked_lm.eval()


# In[ ]:


# In[7]:

from allennlp.data import Token
from allennlp.data.token_indexers import PretrainedBertIndexer

token_indexer = PretrainedBertIndexer(
    pretrained_model=config.model_type,
    max_pieces=config.max_seq_len,
    do_lowercase=True,
)

# if len(toks) < config.max_seq_len:
#     return toks + (["[PAD]"] * (maxlen - len(toks)))
# else:
def tokenizer(s: str):
    maxlen = config.max_seq_len - 2
    toks = token_indexer.wordpiece_tokenizer(s)[:maxlen]
    return toks


# In[8]:
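# In[ ]:

# Hedged sketch (an assumption, not from the original notebook): wordpieces from the
# tokenizer above can be mapped to ids through the indexer's wordpiece vocab and scored
# with the masked LM; pytorch_pretrained_bert's BertForMaskedLM returns prediction scores
# of shape (batch, seq_len, vocab_size) when called without labels.
import torch

def score_sentence(s: str) -> torch.Tensor:
    pieces = ["[CLS]"] + tokenizer(s) + ["[SEP]"]
    piece_ids = [token_indexer.vocab[p] for p in pieces]
    with torch.no_grad():
        prediction_scores = masked_lm(torch.tensor([piece_ids]))
    return prediction_scores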
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append(
            [row['Annotations'], row['Label']] + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    reader = EIDatasetReader(bert_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device
    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('intervention', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)
    result = trainer.train()

    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
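# Hedged usage note (the script name and the __main__ guard are assumptions about how the
# module is run): with the argparse flags defined in main(), a typical invocation would be
#   python train_baseline.py --cuda_device 0 --epochs 2 --batch_size 32 --model_name baseline --tunable
if __name__ == '__main__':
    main()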
HIDDEN_DIM = 256
PRINT_EVERY = 1000
EVALUATE_EVERY_EPOCH = 1
ENCODER_LAYER = 2
DROPOUT_RATE = 0.1
BATCH_SIZE = 32
INIT_LEARNING_RATE = 0.0
EPOCH = 1000
WARMUP_STEPS = 5000
PATIENCE = 10
BERT = False

torch.manual_seed(1)

if BERT:
    token_indexer = PretrainedBertIndexer(
        pretrained_model="bert-base-uncased",
        do_lowercase=True)
    reader = DisfluencyDatasetReader(
        token_indexers={"tokens": token_indexer})
else:
    reader = DisfluencyDatasetReader()

train_dataset = reader.read('../train.txt')
validation_dataset = reader.read('../val.txt')
test_dataset = reader.read('../test.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)

if BERT:
    bert_embedder = PretrainedBertEmbedder(
        pretrained_model="bert-base-uncased",
from allennlp.data.token_indexers import PretrainedBertIndexer

BERT_MAX_LENGTH = 512

token_indexer = PretrainedBertIndexer(
    pretrained_model="bert-base-uncased",
    max_pieces=BERT_MAX_LENGTH,
    do_lowercase=True,
)


def tokenizer(s: str):
    return token_indexer.wordpiece_tokenizer(s)
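# Hedged variant (an assumption about intended use, not original code): PretrainedBertIndexer
# adds [CLS] and [SEP] itself when indexing, which is why other scripts in this collection
# cap the wordpieces at max_pieces - 2; the tokenizer above can be capped the same way.
def truncated_tokenizer(s: str):
    return token_indexer.wordpiece_tokenizer(s)[:BERT_MAX_LENGTH - 2]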
def run_model(name, context, conf, double_input, use_elmo=False,
              save_predictions=False, save_model=False):
    """
    Runs the given model 'name' for the given 'context' and agreement level 'conf'.
    If double_input is True, runs the combined model using context comment text.
    Optionally saves the trained model, its vocabulary, and the predictions.

    Allowed names: lstm | bilstm | stacked_bilstm | cnn | dense_lstm | dense_bilstm |
                   dense_stacked_bilstm | dense_cnn | nli_cnn | bert | dense_bert

    If use_elmo=True, uses ELMo's pre-trained language model for the embeddings.
    """
    if use_elmo:
        # the token indexer is responsible for mapping tokens to integers; this makes sure
        # the mapping is consistent with what was used in the original ELMo training.
        token_indexer = ELMoTokenCharactersIndexer()
    elif name == 'bert':
        global bert_token_indexer
        bert_token_indexer = PretrainedBertIndexer(pretrained_model=BERT_MODEL,
                                                   do_lowercase=True)
    else:
        token_indexer = SingleIdTokenIndexer()

    if name == 'bert':  # BERT uses a special wordpiece tokenizer
        reader = data_reader.UnpalatableDatasetReader(
            main_input=context, additional_context=double_input,
            tokenizer=tokenizer_bert,
            token_indexers={"tokens": bert_token_indexer},
            label_cols=LABEL_COLS)
    else:
        reader = data_reader.UnpalatableDatasetReader(
            main_input=context, additional_context=double_input,
            tokenizer=tokenizer,
            token_indexers={"tokens": token_indexer},
            label_cols=LABEL_COLS)

    map_reply_id_pred_probability = {}
    n_epochs = []
    f1s, AUROCs, weighted_f1s, precision_s, recall_s, accuracies, AUPRCs = [], [], [], [], [], [], []

    for fold_number in range(1, 6):  # 5-fold cross validation
        train_fname = 'train_data_fold_' + str(fold_number) + '_OneHot.csv'
        val_fname = 'val_data_fold_' + str(fold_number) + '_OneHot.csv'
        test_fname = 'test_data_fold_' + str(fold_number) + '_OneHot.csv'

        train_dataset = reader.read(file_path=DATA_ROOT / conf / train_fname)
        validation_dataset = reader.read(file_path=DATA_ROOT / conf / val_fname)
        test_dataset = reader.read(file_path=DATA_ROOT / conf / test_fname)
        print("\n#####################################################\n",
              double_input, context, conf, name,
              len(train_dataset), len(validation_dataset), len(test_dataset))

        # Train model:
        if name == 'lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=False, num_layers=1,
                                                bidirectional=False, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_lstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=True, col_name=context,
                                                num_layers=1, bidirectional=False,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=False, num_layers=1,
                                                bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=True, col_name=context,
                                                num_layers=1, bidirectional=True,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=False, num_layers=2,
                                                bidirectional=True, use_elmo=use_elmo,
                                                double_input=double_input)
        elif name == 'dense_stacked_bilstm':
            model, vocab, ep = train.train_lstm(train_dataset, validation_dataset, BATCH_SIZE,
                                                dense_vector=True, col_name=context,
                                                num_layers=2, bidirectional=True,
                                                use_elmo=use_elmo, double_input=double_input)
        elif name == 'cnn':
            if context == 'reply_text':
                filter_sizes = (2, 3)  # kernels cannot be bigger than the shortest sentence
            else:
                filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE,
                                               dense_vector=False, num_filters=100,
                                               filter_sizes=filter_sizes, use_elmo=use_elmo,
                                               double_input=double_input)
        elif name == 'dense_cnn':
            if context == 'reply_text':
                filter_sizes = (2, 3)  # kernels cannot be bigger than the shortest sentence
            else:
                filter_sizes = (2,)
            model, vocab, ep = train.train_cnn(train_dataset, validation_dataset, BATCH_SIZE,
                                               dense_vector=True, col_name=context,
                                               num_filters=100, filter_sizes=filter_sizes,
                                               use_elmo=use_elmo, double_input=double_input)
        elif name == 'nli_cnn':
            if not double_input:
                print("Error: NLI-inspired architecture only accepts double-input.")
                return [None] * 9
            filter_sizes = (2, 3)
            model, vocab, ep = train.train_nli(train_dataset, validation_dataset, BATCH_SIZE,
                                               use_elmo=use_elmo, num_filters=100,
                                               filter_sizes=filter_sizes)
        elif name == 'bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL,
                                                dense_vector=False, double_input=double_input)
        elif name == 'dense_bert':
            model, vocab, ep = train.train_bert(train_dataset, validation_dataset, BATCH_SIZE,
                                                pretrained_model=BERT_MODEL,
                                                dense_vector=True, col_name=context,
                                                double_input=double_input)
        else:
            sys.exit("'name' not valid")

        n_epochs.append(ep)  # keep track of the actual number of training epochs for each fold

        # Predict and evaluate the model on the test set:
        preds = evaluate.make_predictions(model, vocab, test_dataset, BATCH_SIZE, use_gpu=False)
        # NOTE: preds has shape (number of samples, 2); the columns hold the probabilities of
        # the two classes in the order ['yes_unp', 'not_unp'].
        f1, auroc, w_f1, precision, recall, acc, auprc = evaluate.compute_metrics(preds, test_dataset)

        if save_predictions:  # save predictions for error analysis
            replyid_pred = evaluate.map_id_prediction(preds, test_dataset)
            if set(replyid_pred.keys()).intersection(set(map_reply_id_pred_probability.keys())) != set():
                # sanity check
                sys.exit("Error: There is overlap in Test IDs across folds.")
            map_reply_id_pred_probability.update(replyid_pred)

        if save_model:  # save the model weights and vocabulary
            with open('./tmp/' + name + '_model_conf_' + conf.split('-')[1] + '_fold_' + str(fold_number) + '.th', 'wb') as f:
                torch.save(model.state_dict(), f)
            vocab.save_to_files("./tmp/" + name + "_vocabulary_" + conf.split('-')[1] + "_fold_" + str(fold_number))

        print("\nFold #{} | F1 = {} | AUROC = {} | AUPRC = {}".format(fold_number, f1, auroc, auprc))
        f1s.append(f1); AUROCs.append(auroc); weighted_f1s.append(w_f1)
        precision_s.append(precision); recall_s.append(recall)
        accuracies.append(acc); AUPRCs.append(auprc)

    mean_f1 = np.array(f1s).mean()
    mean_auroc = np.array(AUROCs).mean()
    mean_weighted_f1 = np.array(weighted_f1s).mean()
    mean_precision = np.array(precision_s).mean()
    mean_recall = np.array(recall_s).mean()
    mean_accuracy = np.array(accuracies).mean()
    mean_auprc = np.array(AUPRCs).mean()

    print("Total predictions: {} | Save Predictions: {}".format(
        len(map_reply_id_pred_probability), save_predictions))

    return (mean_f1, mean_auroc, mean_weighted_f1, mean_precision, mean_recall,
            mean_accuracy, mean_auprc, map_reply_id_pred_probability, n_epochs)
def multiprocess_single_sequence_loader(process_number: int, _config,
                                        _queue: mp.Queue, _wait_for_exit: mp.Event,
                                        _local_file,
                                        _fasttext_vocab_cached_mapping,
                                        _fasttext_vocab_cached_data):
    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(
            pretrained_model=_config["bert_pretrained_model"],
            do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])
        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(
            lazy=True,
            tokenizer=_tokenizer,
            token_indexers=_token_indexers,
            max_seq_length=_config["max_doc_length"],
            min_seq_length=_config["min_doc_length"],
        )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)  # signal the end of the queue
    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until the shared memory is no longer needed
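# Hedged consumer sketch (an assumption, not from the original code): the loader above
# terminates its stream with a single None, so the parent process can drain the queue
# roughly like this.
def iterate_shared_queue(_queue: mp.Queue):
    while True:
        training_batch = _queue.get()
        if training_batch is None:  # end-of-stream marker set by the loader
            break
        yield training_batch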