def tokenize_and_cache_data(data_dir, output_dir, tokenizer=None, tokenizer_path=None, n_sentences=0,
                            use_overflow=False, two_segments=True, delete_existing=False, max_length=512):
    """Tokenize every entry of ``data_dir`` and cache the results in ``output_dir``.

    Each directory entry is handed to ``process_one_file``; the ``num_examples``
    and ``num_tokens`` values of the dict it returns are accumulated and shown
    in the progress-bar description.

    :param data_dir: directory whose entries are processed one by one.
    :param output_dir: directory the processed output is written to (created if missing).
    :param tokenizer: tokenizer to use; when ``None`` one is built from ``tokenizer_path``.
    :param tokenizer_path: vocabulary path used only when ``tokenizer`` is ``None``.
    :param n_sentences: forwarded unchanged to ``process_one_file``.
    :param use_overflow: forwarded unchanged to ``process_one_file``.
    :param two_segments: forwarded unchanged to ``process_one_file``.
    :param delete_existing: remove an existing ``output_dir`` before processing.
    :param max_length: truncation and padding length configured on the tokenizer.
    :return: None
    """
    if tokenizer is None:
        tokenizer = BertWordPieceTokenizer(tokenizer_path)
    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(max_length=max_length)

    num_tokens = 0
    num_examples = 0

    # Only remove the directory when it is really there: rmtree raises on a
    # missing path, which used to break the very first run with delete_existing.
    if delete_existing and os.path.isdir(output_dir):
        rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    pbar = tqdm(os.listdir(data_dir))
    for path in pbar:
        result = process_one_file(data_dir, path, tokenizer, output_dir, n_sentences, use_overflow, two_segments)
        num_examples += result['num_examples']
        num_tokens += result['num_tokens']

        # Guard the ratio: until at least one example exists the denominator is zero.
        total_slots = num_examples * max_length
        density = num_tokens / total_slots if total_slots else 0.0
        pbar.set_description(
            f"{num_tokens} tokens, {num_examples} examples, {density} non-pad tokens"
        )
def get_transformer_tokenizer(vocab_path, max_tokens, device="cpu"):
    """
    Build the tokenizer used with Transformer-based models.

    A lower-casing word-piece tokenizer is created from ``vocab_path``,
    configured for right-side padding (pad type id 1) and truncation to
    ``max_tokens``, then wrapped in a ``TransformerSquadTokenizer`` bound to
    ``device``.
    """
    word_piece = BertWordPieceTokenizer(vocab_path, lowercase=True)
    padding_options = {"direction": "right", "pad_type_id": 1}
    word_piece.enable_padding(**padding_options)
    word_piece.enable_truncation(max_tokens)

    squad_tokenizer = TransformerSquadTokenizer(word_piece, device=device)
    return squad_tokenizer
def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
    """Load (or build and cache) tokenized question/context samples.

    The cache file lives next to ``file_path`` and is named
    ``<args.bert_model_type>_cached_mlm_<filename>``. When it exists the
    samples are loaded from it; otherwise the JSON file is tokenized
    question-by-question with a fast word-piece tokenizer and the result is
    written to the cache.

    :param tokenizer: tokenizer providing the vocabulary, ``max_len`` and ``pad_token``.
    :param file_path: path to the JSON dataset file (must exist).
    :param args: object providing ``bert_model_type`` and ``lowercase``.
    """
    print(file_path)
    assert os.path.isfile(file_path)

    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, args.bert_model_type + "_cached_mlm_" + filename)

    if os.path.exists(cached_features_file):
        # The value is now interpolated; passing it as a second print()
        # argument used to emit a literal "%s".
        print("Loading features from cached file %s" % cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.samples = torch.load(handle)
    else:
        print("Creating features from dataset file at %s" % directory)

        # Get the faster tokenizer from the tokenizers package: dump the
        # vocabulary to the working directory and reload it from there.
        tokenizer.save_vocabulary(vocab_path='.')
        fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=args.lowercase)
        fast_tokenizer.enable_truncation(tokenizer.max_len)
        fast_tokenizer.enable_padding(max_length=tokenizer.max_len, pad_token=tokenizer.pad_token)

        self.samples = []

        # Load data over here
        df = pd.read_json(file_path)
        print('SQUAD data: ')
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            for paragraph in row['data']['paragraphs']:
                context = paragraph['context']
                for qa_pair in paragraph['qas']:
                    question = qa_pair['question']
                    batch = fast_tokenizer.encode(question, context)
                    self.samples.append({
                        'input_ids': batch.ids,
                        'attention_mask': batch.attention_mask
                    })
                    # Text cut off by truncation is kept as extra samples.
                    for encoding in batch.overflowing:
                        self.samples.append({
                            'input_ids': encoding.ids,
                            'attention_mask': encoding.attention_mask
                        })

        # Drop the reference to the raw frame before writing the cache.
        df = None

        print("Saving features into cached file: ", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            torch.save(self.samples, handle, pickle_protocol=pickle.HIGHEST_PROTOCOL)
def tokenize(texts: pd.Series, tokenizer: BertWordPieceTokenizer, chunk_size: int = 240, maxlen: int = 512) -> np.array:
    '''Encode ``texts`` in chunks of ``chunk_size`` and return the token ids as an array.

    The tokenizer is configured to truncate and pad to ``maxlen`` first.
    '''
    tokenizer.enable_truncation(max_length=maxlen)
    # The padding keyword differs between tokenizers releases: try
    # ``max_length`` first and fall back to ``length`` on TypeError.
    try:
        tokenizer.enable_padding(max_length=maxlen)
    except TypeError:
        tokenizer.enable_padding(length=maxlen)

    collected = []
    start = 0
    total = len(texts)
    while start < total:
        stop = start + chunk_size
        encoded_chunk = tokenizer.encode_batch(texts[start:stop].tolist())
        for encoding in encoded_chunk:
            collected.append(encoding.ids)
        start = stop

    return np.array(collected)
class BERT16SDataset(Dataset):
    """
    Torch dataset that reads 16S sequences from a tab-separated file and encodes them for BERT.

    :param vocab_path: str, path to the pre-trained bert tokenizer vocab file.
    :param data_path: str, path to the 16S data file (tab-separated, with a ``seq`` column).
    :param block_size: int, truncation and padding length configured on the tokenizer.
    :param max_word_length: int, length of the whitespace-separated chunks a sequence is cut into
        before it is handed to the tokenizer.
    """

    def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):
        assert os.path.isfile(data_path)
        assert os.path.isfile(vocab_path)

        _logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
        tokenizer = BertWordPieceTokenizer(
            vocab_path,
            handle_chinese_chars=False,
            lowercase=False)
        tokenizer.enable_truncation(block_size)
        tokenizer.enable_padding(max_length=block_size)
        self.tokenizer = tokenizer

        _logger.info(f"Loading 16S dataset file at {data_path}...")
        corpus = pd.read_csv(data_path, sep='\t')
        self._16s_corpus_df = corpus
        _logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")

        self.samples = self._16s_corpus_df.seq.values.tolist()
        self.max_word_length = max_word_length

    def __len__(self):
        # One sample per row of the loaded corpus frame.
        row_count = len(self._16s_corpus_df)
        return row_count

    def __getitem__(self, i):
        # Chunk the raw sequence, encode it, and expose the ids as a long tensor.
        spaced_sequence = self._split_sequence_by_max_word_length(self.samples[i])
        encoding = self.tokenizer.encode(spaced_sequence)
        return torch.tensor(encoding.ids, dtype=torch.long)

    def _split_sequence_by_max_word_length(self, seq):
        """
        Cut a 16S sequence into consecutive pieces of at most ``max_word_length``
        characters and join them with single spaces.
        :param seq: str, 16S sequence
        :return: str
        """
        width = self.max_word_length
        pieces = []
        for start in range(0, len(seq), width):
            pieces.append(seq[start: start + width])
        return ' '.join(pieces)
class Tokenizer:
    """Thin wrapper that loads or trains a word-piece tokenizer for one language.

    The vocabulary is cached at ``data/<lang>/vocab.txt``; when that file exists
    the tokenizer is initialised from it, otherwise an untrained tokenizer is
    created and ``train_tokenizer`` must be called.
    """

    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same

        :param lang: language identifier, used as the sub-directory name under ``data/``.
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        # makedirs also creates the parent "data" directory and tolerates an
        # existing one; a plain mkdir failed when "data" was missing.
        os.makedirs(self.tokenizer_dir, exist_ok=True)

        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences.

        Does nothing when the cached vocabulary file already exists.

        :param sentences: iterable of str, one training sentence each.
        """
        if os.path.exists(self.vocab):
            return

        print("Training tokenizer for {}".format(self.tokenizer_dir))
        # The training API takes file paths, so the sentences are written to a
        # temporary file first.
        data_file = self.tokenizer_dir + "/data.txt"
        try:
            with open(data_file, "w+", encoding="utf-8") as f:
                f.writelines(sentence + "\n" for sentence in sentences)
            self.tokenizer.train([data_file])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))
        finally:
            # Remove the temp file even when training or saving fails, so a
            # failed run does not leave it behind.
            if os.path.exists(data_file):
                os.remove(data_file)

    def encode(self, decoded):
        """Encode a single text with the wrapped tokenizer and return its result."""
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        """Decode a batch of id sequences via the wrapped tokenizer's ``decode_batch``."""
        return self.tokenizer.decode_batch(encoded)
def get_preds(list_of_texts):
    """Run the multilingual DistilBERT classifier over ``list_of_texts``.

    The model is built around the pretrained transformer layer and its weights
    are loaded from ``model/weights``. The texts are encoded with a fast
    word-piece tokenizer (truncated and padded to ``MAX_LEN``), each prediction
    is printed, and the tuple ``(parsed, predictions)`` is returned, where
    ``parsed`` is currently always an empty list.
    """
    checkpoint = 'distilbert-base-multilingual-cased'

    backbone = transformers.TFDistilBertModel.from_pretrained(checkpoint)
    model = build_model(backbone, max_len=MAX_LEN)
    model.load_weights('model/weights')
    print('weights loaded')

    # Save the slow tokenizer's files locally so the fast tokenizer can be
    # created from the resulting vocab.txt.
    slow_tokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint)
    slow_tokenizer.save_pretrained('.')
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
    fast_tokenizer.enable_truncation(max_length=MAX_LEN)
    fast_tokenizer.enable_padding(length=MAX_LEN)

    encodings = fast_tokenizer.encode_batch(list_of_texts)
    id_rows = [encoding.ids for encoding in encodings]
    all_ids = np.array(id_rows).astype(np.float32)

    to_predict = create_test(all_ids)
    predictions = model.predict(to_predict)
    for prediction in predictions:
        print(prediction)

    # JSON serialisation of the predictions is not implemented; callers get an
    # empty placeholder alongside the raw predictions.
    parsed = []
    return parsed, predictions
def _run_mlm_epoch(args, pre_model, mlm_dataloader, mlm_optimizer, tokenizer):
    """Run one masked-language-modelling pass over ``mlm_dataloader``.

    Batches are masked with ``mask_tokens`` and fed to ``pre_model``; gradient
    accumulation and clipping follow ``args.grad_accum`` / ``args.max_grad_norm``.

    :return: mean per-batch loss (0.0 when the loader yields no batches).
    """
    pre_model.train()
    epoch_loss = 0
    num_batches = 0
    for batch in tqdm(mlm_dataloader):
        num_batches += 1

        key = "input_ids" if "input_ids" in batch else "ctx_input_ids"
        input_ids = batch[key]
        # Use the same device policy as the main training loop instead of a
        # hard-coded .cuda(), which failed on CPU-only hosts and could place
        # the batch on a different GPU than the model.
        if torch.cuda.is_available():
            input_ids = input_ids.to(args.device)
        inputs, labels = mask_tokens(input_ids, tokenizer)

        loss = pre_model(inputs, labels)
        if args.grad_accum > 1:
            loss = loss / args.grad_accum
        loss.backward()
        epoch_loss += loss.item()

        if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                               args.max_grad_norm)
            mlm_optimizer.step()
            pre_model.zero_grad()

    return epoch_loss / num_batches if num_batches else 0.0


def train(args, rep):
    """Train, early-stop, checkpoint and test a task model.

    Steps: seed the RNGs, derive/create the output directory, save the run
    arguments, build tokenizer, datasets, loaders and the task model,
    optionally run MLM pre-training, train for ``args.num_epochs`` epochs with
    per-epoch evaluation (best model saved to ``model.pt``), then reload the
    best checkpoint and evaluate it on the test set.

    :param args: run configuration namespace (paths, task, hyper-parameters, flags).
    :param rep: repetition index, only used in the generated output directory name.
    :return: score of the best checkpoint on the test set.
    :raises Exception: if the output directory exists and ``args.num_epochs`` is not 0.
    :raises ValueError: if ``args.task`` is not one of the supported tasks.
    """
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        model_type = "_example" if args.example else "_linear"
        data_path = '_' + '_'.join(
            args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_on = "_mlmtrain" if args.mlm_data_path == "" or args.mlm_data_path == args.train_data_path else "_mlmfull"
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        observer = "_observer" if args.use_observers else ""
        name = base + model_type + data_path + mlm_on + mlm_pre + mlm_dur + observer + "_v{}".format(
            rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        # Zero epochs with an existing directory: reuse it as-is.
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))

    # Save args (the context manager makes sure the JSON file is flushed and closed)
    with open(os.path.join(args.output_dir, 'args.json'), "w+") as args_file:
        json.dump(args.__dict__, args_file)
    torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(
        ".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path,
                                       lowercase=args.do_lowercase)
    tokenizer.enable_padding(max_length=args.max_seq_length)
    tokenizer.save(args.output_dir + "/tokenizer")

    # Data readers
    if args.task == "intent":
        dataset_initializer = IntentDataset
    elif args.task == "slot":
        if 'taskmaster' in args.train_data_path:
            dataset_initializer = TMSlotDataset
        else:
            dataset_initializer = SlotDataset
    elif args.task == "response":
        dataset_initializer = ResponseSelectionDataset
    elif args.task == "dst":
        dataset_initializer = StateTrackingDataset
    elif args.task == "top":
        dataset_initializer = TOPDataset
    else:
        raise ValueError("Not a valid task type: {}".format(args.task))

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)
    if args.mlm_data_path != '':
        mlm_dataset = dataset_initializer(args.mlm_data_path, tokenizer,
                                          args.max_seq_length, token_vocab_name)
    else:
        mlm_dataset = train_dataset
    val_dataset = dataset_initializer(
        args.val_data_path, tokenizer, 512,
        token_vocab_name) if args.val_data_path else None
    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512,
                                       token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=True)
    mlm_dataloader = DataLoader(dataset=mlm_dataset,
                                batch_size=args.train_batch_size,
                                shuffle=True,
                                pin_memory=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=1,
                                pin_memory=True) if val_dataset else None
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=1,
                                 pin_memory=True)

    # Load model
    if args.task == "intent":
        if args.example:
            model = ExampleIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                use_observers=args.use_observers)
        else:
            model = IntentBertModel(args.model_name_or_path,
                                    dropout=args.dropout,
                                    num_intent_labels=len(
                                        train_dataset.intent_label_to_idx),
                                    use_observers=args.use_observers)
    elif args.task == "slot":
        if args.example:
            model = ExampleSlotBertModel(args.model_name_or_path,
                                         dropout=args.dropout,
                                         num_slot_labels=len(
                                             train_dataset.slot_label_to_idx),
                                         use_observers=args.use_observers)
        else:
            model = SlotBertModel(args.model_name_or_path,
                                  dropout=args.dropout,
                                  num_slot_labels=len(
                                      train_dataset.slot_label_to_idx),
                                  use_observers=args.use_observers)
    elif args.task == "response":
        model = ResponseSelectionBertModel(args.model_name_or_path,
                                           dropout=args.dropout)
    elif args.task == "dst":
        model = StateTrackingBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=train_dataset.slot_lengths)
    elif args.task == "top":
        if args.example:
            model = ExampleJointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
        else:
            model = JointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(
            args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        for epoch in trange(3, desc="Pre-train Epochs"):
            mlm_loss = _run_mlm_epoch(args, pre_model, mlm_dataloader,
                                      mlm_optimizer, tokenizer)
            LOGGER.info("Epoch loss: {}".format(mlm_loss))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0

        if args.task == "top" and args.example:
            # Pre-fill cache but don't return anything
            retrieve_examples(train_dataset, None, None, task="top")

        for batch in tqdm(train_dataloader):
            num_batches += 1
            global_step += 1

            # Transfer to gpu (list-valued entries are left untouched)
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue
                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["intent_label"],
                                                 batch["ind"],
                                                 task="intent")
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_intents=examples["intent_label"])
                else:
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"])
                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum
                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["slot_labels"],
                                                 batch["ind"],
                                                 task="slot",
                                                 num=64)
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"],
                        example_word_inds=examples["word_ind"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_slots=examples["slot_labels"])
                else:
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "response":
                resp_loss = model(
                    ctx_input_ids=batch["ctx_input_ids"],
                    ctx_attention_mask=batch["ctx_attention_mask"],
                    ctx_token_type_ids=batch["ctx_token_type_ids"],
                    rsp_input_ids=batch["rsp_input_ids"],
                    rsp_attention_mask=batch["rsp_attention_mask"],
                    rsp_token_type_ids=batch["rsp_token_type_ids"])
                resp_loss.backward()
                epoch_loss += resp_loss.item()
            elif args.task == "dst":
                _, state_loss = model(input_ids=batch["input_ids"],
                                      attention_mask=batch["attention_mask"],
                                      token_type_ids=batch["token_type_ids"],
                                      state_label=batch["state_label"])
                state_loss.backward()
                epoch_loss += state_loss.item()
            elif args.task == "top":
                if args.example:
                    # Get intent examples
                    intent_examples = retrieve_examples(train_dataset,
                                                        batch["intent_label"],
                                                        batch["ind"],
                                                        task="intent",
                                                        num=32)
                    # Get slot examples
                    slot_examples = retrieve_examples(train_dataset,
                                                      batch["slot_labels"],
                                                      batch["ind"],
                                                      task="slot",
                                                      num=32)
                    loss = model(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 token_type_ids=batch["token_type_ids"],
                                 intent_label=batch["intent_label"],
                                 slot_labels=batch["slot_labels"],
                                 intent_examples=intent_examples,
                                 slot_examples=slot_examples)
                else:
                    _, _, loss = model(input_ids=batch["input_ids"],
                                       attention_mask=batch["attention_mask"],
                                       token_type_ids=batch["token_type_ids"],
                                       intent_label=batch["intent_label"],
                                       slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            # Step only every grad_accum batches (or every batch when accumulation is off)
            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                model.zero_grad()

        # An empty loader would otherwise divide by zero here.
        LOGGER.info("Epoch loss: {}".format(
            epoch_loss / num_batches if num_batches else 0.0))

        # Evaluate and save checkpoint
        score = evaluate(model,
                         val_dataloader,
                         train_dataloader,
                         tokenizer,
                         task=args.task,
                         example=args.example,
                         device=args.device)
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        # Patience grows only on a strictly worse score; ties reset it.
        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(
                args.task, score))
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(),
                       os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            mlm_loss = _run_mlm_epoch(args, pre_model, mlm_dataloader,
                                      mlm_optimizer, tokenizer)
            LOGGER.info("MLMloss: {}".format(mlm_loss))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     train_dataloader,
                     tokenizer,
                     task=args.task,
                     example=args.example,
                     device=args.device)
    print("Best result for {}: Score: {}".format(args.task, score))
    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
# FAZ O DOWNLOAD DO PRE-TREINADO EM PT-BT if not os.path.exists('bert-base-portuguese-cased_pytorch_checkpoint.zip'): wget.download("https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip") !unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-portuguese # CRIA O TOKENIZER A PARTIR DE UM VOCABULÁRIO # LOWERCASE = FALSE (NÃO IRÁ CONVERTER AS ENTRADAS PARA LOWERCASE. MANTEM O ORGINIAL) # STRIP ACCENTS = FALSE (MANTEM OS ACENTOS) tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False, strip_accents=False) # MOSTRA AS INFORMAÇÕES DO TONENIZER print(tokenizer) # PERMITE O TRUNCATION E O PADDING tokenizer.enable_truncation(max_length=60) tokenizer.enable_padding() # TOKENINZA EM BATCH TODAS AS SENTENÇAS # TEM QUE USAR .TOLIST PARA CONVERTER POR LISTA. SENTENCAS É UM ARRAY NUMPY output = tokenizer.encode_batch(sentencas.tolist()) # O TOKENIZER RETORAR UMA LISTA DE OBJETOS DO TIPO TOKENIZER # PRECISAMOS PEGAR OS ATRIBUTOS IDS E MASKS E ADICIONAR PARA LISTAS # OS OBJETOS TEM O ATRIBUTO IDS(IDS), TOKENS (TOKENS) E attention_mask # PRECISAMOS FAXER O FOR PARA PEGAR CADA UM E DEPOIS CRIAR A LISTA ids=[x.ids for x in output] attention_mask = [x.attention_mask for x in output] print(len(ids)) print(len(attention_mask))
class InferNER(object):
    """Run one or more BERT token-classification heads over text documents.

    Each head directory provides a base BERT model, a head config and a
    vocabulary. Documents are split into sentences, every sentence is encoded
    and scored by each loaded model, and the per-subword predictions are
    written to a tab-separated file.
    """

    def __init__(self, head_directories, head_configs, device=None, from_huner=False, lowercase=False):
        """
        :param head_directories: list containing the directory paths to the head models.
        :param head_configs: a list containing the paths to the head config files
            (relative to the matching head directory).
        :param device: One of 'cpu' or 'cuda'. When omitted, 'cuda' is used if available, else 'cpu'.
        :param from_huner: when True, a model is only applied to documents whose file name
            (``<entity>_<dataset>.txt``) matches the model's entity type and dataset.
        :param lowercase: preprocessing option. If predicting an entity type, like Gene, where the
            case matters, set to False (default).
        """
        # SET DEVICE
        if not device:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        assert len(head_directories) == len(head_configs)

        # LOAD MODELS
        self.models = []
        for head, head_config_name in zip(head_directories, head_configs):
            # LOAD BASE MODEL
            print(f'Loading BERT pre-trained model {head}')
            self.bert = BertModel.from_pretrained(head, from_tf=False)

            # LOAD HEAD
            print(f'Loading {head}')
            path_to_head_config = os.path.join(head, head_config_name)
            self.path_to_vocab = os.path.join(head, 'vocab.txt')
            self.head_directory = head
            self.head_config = BertConfig.from_pretrained(path_to_head_config)
            # Read the raw config with a context manager so the handle is closed.
            with open(path_to_head_config, 'rb') as config_file:
                head_config_dict = json.load(config_file)
            self.head = SubwordClassificationHead(head_config_dict['head_task'],
                                                  labels=head_config_dict['labels'])
            print(self.head.from_pretrained(self.head_directory))

            # Collect models; entity type and dataset are taken from the
            # underscore-separated head directory name.
            name_parts = head.split('_')
            self.models.append({'head': self.head,
                                'base': self.bert,
                                'entity_type': name_parts[-3],
                                'dataset': name_parts[-4]})

        # LOAD TOKENIZER AND SET OPTIONS
        print('Loading Tokenizer and setting options')
        self.tokenizer = BertWordPieceTokenizer(self.path_to_vocab,  # uses last head loaded for vocab
                                                lowercase=lowercase)
        self.tokenizer.enable_padding(direction='right',
                                      pad_id=0,
                                      max_length=self.head_config.max_position_embeddings)

        # CONSTRUCT PROCESSORS
        head_name = os.path.basename(self.head_directory)
        self.sentencizer = TransformersLanguage(trf_name=head_name, meta={"lang": "en"})
        self.sentencizer.add_pipe(self.sentencizer.create_pipe("sentencizer"))
        print('Loaded BERT head, config, tokenizer, and sentencizer')

        self.labels = sorted(self.head.config.labels)  # Fine-tuning may have been done on sorted labels.
        self.from_huner = from_huner

    def run_document(self, path_to_document, output_filename=None, output_directory="."):
        """Annotate one document and write the result as a TSV file.

        :param path_to_document: path of the UTF-8 text file to annotate.
        :param output_filename: name of the output file inside ``output_directory``;
            defaults to ``example_output.tsv`` when omitted.
        :param output_directory: directory the output file is written to.
        :raises Exception: if an encoded sentence is longer than 512 tokens.
        """
        # Resolve the default name up front: joining the directory with None
        # used to raise TypeError before the fallback branch could be reached.
        output_filename_fullpath = os.path.join(output_directory,
                                                output_filename or 'example_output.tsv')
        if os.path.exists(output_filename_fullpath):
            print(f"{output_filename_fullpath} exists, skipping")
            return

        with open(path_to_document, encoding='utf-8') as f:
            document_as_string = f.read()  # does this work for large documents?

        self.output_dict = {'tokens': [],
                            'sentence_spans': [],
                            'document_spans': [],
                            'probability': [],
                            'labels': []}

        sentencized_document = self.sentencizer(document_as_string)
        number_of_sentences = len(list(sentencized_document.sents))
        test_stop = 10000000  # upper bound on the number of sentences annotated per model

        for model in self.models:
            self.head = model['head']
            self.bert = model['base']

            if self.from_huner:
                # Only apply a model to documents named for its entity type and dataset.
                model_entity_type = model['entity_type']
                model_dataset = model['dataset']
                document_name_parts = os.path.basename(path_to_document).split("_")
                document_entity_type = document_name_parts[0]
                document_dataset = document_name_parts[1].replace('.txt', '')
                if model_entity_type != document_entity_type:
                    print(model_entity_type, document_entity_type)
                    continue
                if model_dataset != document_dataset:
                    print(model_dataset, document_dataset)
                    continue

            for sentence_idx, sentence in enumerate(sentencized_document.sents):
                annotation_start = time.time()
                if sentence_idx > test_stop:
                    break
                print(f'\nAnnotating sentence {sentence_idx} of {number_of_sentences}')

                self.sentence = sentence
                self.sentence_idx = sentence_idx
                self.sentence_encoding = self.tokenizer.encode(self.sentence.string)
                if len(self.sentence_encoding) > 512:
                    print(f"In document {os.path.basename(output_filename_fullpath)}, this sentence exeeds the maximum token sequence size\n{self.sentence}")
                    print("Skipping document")
                    raise Exception(
                        f"Sentence {sentence_idx} exceeds the maximum token sequence size")

                # PREPARE MODEL INPUT
                input_ids = torch.tensor([self.sentence_encoding.ids], dtype=torch.long)
                attention_mask = torch.tensor([self.sentence_encoding.attention_mask], dtype=torch.long)
                self.document = sentencized_document
                self.tokens = self.sentence_encoding.tokens
                self.spans = self.sentence_encoding.offsets
                self.input_ids = input_ids

                # RUN EXAMPLE THROUGH BERT
                self.bert.eval()
                if not next(self.bert.parameters()).is_cuda:
                    self.bert.to(device=self.device)
                self.head.eval()
                if not next(self.head.parameters()).is_cuda:
                    self.head.to(device=self.device)

                with torch.no_grad():
                    print(f"BERT Head: {self.head}")
                    print(f"On {self.device} device")
                    input_ids = input_ids.to(device=self.device)
                    attention_mask = attention_mask.to(device=self.device)
                    # Token type ids and position ids are deliberately not supplied.
                    self.bert_outputs = self.bert(input_ids=input_ids,
                                                  attention_mask=attention_mask,
                                                  token_type_ids=None,
                                                  position_ids=None)[0]
                    self.subword_scores = self.head(self.bert_outputs)[0]
                    self.subword_scores_softmax = softmax(self.subword_scores,
                                                          dim=2)  # Get probabilities for each label

                    best_per_subword = self.subword_scores_softmax.max(2)
                    self.predicted_label_keys = best_per_subword[1][0]
                    self.predicted_label_probabilities = best_per_subword[0][0].cpu().numpy()

                    self.labels = sorted(self.head.config.labels)
                    self.predicted_labels = [self.labels[label_key] for label_key in
                                             self.predicted_label_keys.cpu().numpy()]

                    # Keep only real subwords (special-tokens mask value 0).
                    token_mask = self.sentence_encoding.special_tokens_mask
                    subwords_idx = [index_of_subword for index_of_subword, mask in enumerate(token_mask)
                                    if mask == 0]

                    self.predicted_label_probabilities = [self.predicted_label_probabilities[i]
                                                          for i in subwords_idx]
                    self.output_tokens = [self.sentence_encoding.tokens[i] for i in subwords_idx]

                    # Subword spans, relative to the sentence and to the document
                    self.output_spans_within_sentence = [
                        " ".join([str(span_idx) for span_idx in self.sentence_encoding.offsets[i]])
                        for i in subwords_idx]
                    self.output_spans_within_document = [
                        " ".join([str(span_idx + self.sentence.start_char)
                                  for span_idx in self.sentence_encoding.offsets[i]])
                        for i in subwords_idx]

                    # Labels, with the generic "NP" tag replaced by the model's entity type
                    self.output_labels = [self.predicted_labels[i].replace("NP", model['entity_type'])
                                          for i in subwords_idx]

                    # Update document output (extend in place instead of rebuilding the lists)
                    self.output_dict['tokens'].extend(self.output_tokens)
                    self.output_dict['sentence_spans'].extend(self.output_spans_within_sentence)
                    self.output_dict['document_spans'].extend(self.output_spans_within_document)
                    self.output_dict['probability'].extend(self.predicted_label_probabilities)
                    self.output_dict['labels'].extend(self.output_labels)

                annotation_end = time.time()
                print(
                    f'finished sentence {sentence_idx} of {number_of_sentences} in {annotation_end - annotation_start:0.2f} seconds')

        if self.output_dict:
            self.output_table = pd.DataFrame.from_dict(self.output_dict)
            self.output_table.to_csv(output_filename_fullpath, sep='\t', header=True,
                                     index=True, index_label="#")

    def run_all_documents(self, path_to_document_dir, output_directory=".", recursive=False):
        """Annotate every ``.txt`` file found in ``path_to_document_dir``.

        Documents whose output file already exists are skipped. Failures are
        appended to ``infer.log`` and ``infer_failed_list.log`` in the current
        working directory and do not stop the run.

        :param path_to_document_dir: directory to search for input documents.
        :param output_directory: directory the annotated TSV files are written to (created if missing).
        :param recursive: also search sub-directories.
        """
        print('started running all documents')
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        file_list = []
        if recursive:
            print(f'Looking for files to add in {path_to_document_dir}. Searching Recursively')
            for root, directories, filenames in os.walk(path_to_document_dir):
                for filename in filenames:
                    file_list.append(os.path.join(root, filename))
        else:
            print(f'Looking for files to add. Searching {path_to_document_dir}')
            for filename in os.listdir(path_to_document_dir):
                file_list.append(os.path.join(path_to_document_dir, filename))

        # Context managers guarantee both log files are flushed and closed.
        with open('infer.log', 'a') as log, open('infer_failed_list.log', 'a') as failed_list_log:
            for input_document in file_list:
                if not input_document.endswith(".txt"):
                    continue

                output_basename = os.path.basename(input_document).replace('.txt', '') + "_biobert_annotated"
                output_filename = output_basename + ".tsv"

                # Check if the out file exists already, if so skip it.
                if os.path.exists(os.path.join(output_directory, output_filename)):
                    print(f'Skipping document {input_document}. \nResults already in {output_directory}/{output_filename}')
                    continue

                print(f'Running document {input_document}. \nSaving Results to {output_directory}/{output_filename}')
                try:
                    self.run_document(input_document, output_filename, output_directory)
                except Exception as e:
                    # Best effort by design: record the failure and go on with the next document.
                    print(f"Failed to process {output_filename}. See log for error.")
                    print(f"Failed to process {output_filename}: {e}", file=log)
                    print(f"{output_filename}", file=failed_list_log)

    def __str__(self):
        """Return the sentences of the most recently processed document, one per line."""
        document = getattr(self, 'document', None)
        if document is None:
            return f"{type(self).__name__}(no document processed)"
        # __str__ must return a str; returning the sentence generator itself
        # made str(obj) raise TypeError.
        return "\n".join(str(sentence) for sentence in document.sents)
model.compile(Adam(lr=6e-6), loss='binary_crossentropy', metrics=['accuracy', 'AUC']) return model transformer_layer = (transformers.TFDistilBertModel.from_pretrained( 'distilbert-base-multilingual-cased')) model = build_model(transformer_layer, max_len=MAX_LEN) model.load_weights('/home/aziz/vneuron/model/weights') fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False) fast_tokenizer.enable_truncation(max_length=MAX_LEN) fast_tokenizer.enable_padding(length=MAX_LEN) app = Flask(__name__) @app.route('/') def index(): return render_template('index.html') @app.route('/predict', methods=['POST']) def predict(): text = request.form['content'] text = [str(text)] all_ids = []
parser.add_argument("--batch_size", type=int, default=1) args = parser.parse_args() settings = dm_fast_mapping.EnvironmentSettings(seed=0, level_name=args.level_name) env = dm_fast_mapping.load_from_docker(settings) wrapped_env = FastSlowEnvWrapper(env) temp_timestep = wrapped_env.reset() vision_dim = temp_timestep.observation["RGB_INTERLEAVED"].shape tokenizer = BertWordPieceTokenizer( DIR_PATH + "/embedding_files/bert-base-uncased-vocab.txt", lowercase=True) tokenizer.enable_padding(pad_id=3, length=args.language_dim, pad_token="[PAD]") policy = Agent( language_dim=args.language_dim, vision_dim=vision_dim, num_embeddings=args.num_embeddings, embedding_dim=args.embedding_dim, memory_hidden_dim=args.memory_hidden_dim, tokenizer=tokenizer, ) evaluations = [eval_policy(policy, settings)] replay_buffer = MultiModalReplayBuffer( args.buffer_size,
def tokenizer(
        self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
    """Return the tokenizer/vectorizer, building and caching it on first use.

    Resolution order:
      1. the object already stored in ``self._tokenizer``;
      2. the pickle at ``<self.tokenizer_path>/model.pkl``, unless
         ``self.retrain_tokenizer`` is set;
      3. a model built according to ``self.algorithm`` (``'bert'``,
         ``'count'``, ``'tf'`` or ``'tfidf'``), which is then pickled to the
         same ``model.pkl`` path.

    Whatever is obtained is stored in ``self._tokenizer`` before returning.

    Returns:
        The cached, unpickled, or newly trained/fitted tokenizer.

    Raises:
        NotImplementedError: if ``self.algorithm`` is none of the names above.
    """
    pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
    if self._tokenizer is not None:
        return self._tokenizer
    ### get pickled tokenizer
    if os.path.exists(pkl_path) and not self.retrain_tokenizer:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe as long as tokenizer_path is a trusted local cache.
        with open(pkl_path, 'rb') as f:
            tokenizer = pickle.load(f)
    ### train new tokenizer
    else:
        # a retrain request is consumed by this call
        self.retrain_tokenizer = False
        if self.algorithm == 'bert':
            from tokenizers import BertWordPieceTokenizer
            vocab_file = None
            if self._init_vocabulary is not None:
                vocab_file = os.path.join(self.cache_path, "bert_vocab.txt")
            tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file)
            tokenizer.enable_truncation(max_length=self.max_length)
            tokenizer.enable_padding(length=self.max_length)
            # train the tokenizer
            if self._init_vocabulary is None:
                path = os.path.join(self.cache_path, 'train.txt')
                # The trainer reads the corpus as UTF-8, so write it as UTF-8
                # instead of relying on the platform default encoding.
                with open(path, 'w', encoding='utf-8') as f:
                    for i in chain(self.train_text, self.valid_text,
                                   self.test_text):
                        if len(i) == 0:
                            continue
                        # one document per line
                        f.write(i + "\n" if i[-1] != "\n" else i)
                tokenizer.train(files=path,
                                vocab_size=self.vocab_size,
                                min_frequency=self.min_frequency,
                                limit_alphabet=self.limit_alphabet,
                                show_progress=True)
            # NOTE(review): nesting reconstructed from flattened source; this
            # is assumed to run whether or not training happened - confirm.
            tokenizer.save_model(self.tokenizer_path)
        elif self.algorithm in ('count', 'tf', 'tfidf'):
            if self.algorithm == 'count':
                tokenizer = CountVectorizer(
                    input='content',
                    ngram_range=self.ngram_range,
                    min_df=self.min_frequency,
                    max_df=self.max_frequency,
                    max_features=self.vocab_size,
                    vocabulary=self._init_vocabulary,
                    tokenizer=_simple_tokenizer,
                    stop_words='english')
            elif self.algorithm in ('tf', 'tfidf'):
                tokenizer = TfidfVectorizer(
                    input='content',
                    ngram_range=self.ngram_range,
                    min_df=self.min_frequency,
                    max_df=self.max_frequency,
                    max_features=self.vocab_size,
                    stop_words='english',
                    vocabulary=self._init_vocabulary,
                    tokenizer=_simple_tokenizer,
                    use_idf=False if self.algorithm == 'tf' else True)
            # fit on every split: train, validation and test
            tokenizer.fit((_simple_preprocess(i) for i in chain(
                self.train_text, self.valid_text, self.test_text)))
        else:
            raise NotImplementedError
        # save the pickled model
        with open(pkl_path, "wb") as f:
            pickle.dump(tokenizer, f)
    ### assign and return
    self._tokenizer = tokenizer
    return self._tokenizer
# Inference runtime import onnxruntime as ort from tokenizers import BertWordPieceTokenizer # Helper scripts from .PreprocessData import normalize_text, truncate_text from .Predict import get_ids_and_masks, predict # Initialize ONNX runtime and language model tokenizer vocab_file_path = os.path.join(os.path.dirname(__file__), "Model/bert-base-uncased-vocab.txt") onnx_file_path = os.path.join(os.path.dirname(__file__), "Model/watchdog_model.onnx") tokenizer = BertWordPieceTokenizer(vocab_file_path) tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128) tokenizer.enable_truncation(max_length=128) ort_session = ort.InferenceSession(onnx_file_path) def main(req: func.HttpRequest) -> func.HttpResponse: logging.info('Invoked TextQualityWatchdog Skill.') try: body = json.dumps(req.get_json()) if body: logging.info(body) values = json.loads(body)['values'] results = {}
def main():
    """Command-line entry point: fine-tune and/or evaluate a BERT classifier.

    Parses the arguments, selects the device, builds a WordPiece tokenizer
    from ``<data_dir>/bert_based_uncased_vocab.txt`` and a two-label
    ``bert-base-uncased`` ``BertForSequenceClassification`` model. With
    ``--do_train`` it trains on ``<data_dir>/triples.unique.eq.train.small.csv``
    and saves the model and arguments into ``--output_dir``. With
    ``--do_eval`` (rank -1/0 only) it calls ``evaluate`` on the selected
    checkpoint(s).

    Returns:
        dict: ``results``; nothing in this function adds to it, so it is
        returned empty.

    Raises:
        ValueError: if training is requested into an existing, non-empty
            output directory without ``--overwrite_output_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--num_eval_docs", default=1000, type=int,
                        help="number of docs per query in eval set.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--msmarco_output", action='store_true',
                        help="Return msmarco output format file")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    # The evaluation branch reads args.checkpoint, which was never declared
    # and therefore raised AttributeError. Declared here as optional.
    parser.add_argument("--checkpoint", default=None, type=str,
                        help="Step suffix of a single checkpoint to evaluate "
                             "(<data_dir>/checkpoint-<CHECKPOINT>). Defaults to output_dir.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # The original assigned torch.cuda.device_count() and immediately
        # overwrote it with 1; only the effective value is kept.
        args.n_gpu = 1
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    num_labels = 2
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
    tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
    tokenizer.enable_truncation(args.max_seq_length)
    tokenizer.enable_padding('right', max_length=args.max_seq_length)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    model.to(args.device)
    args.output_mode = 'classification'

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        dataset_path = f'{args.data_dir}/triples.unique.eq.train.small.csv'
        train_dataset = LazyTextDataset(dataset_path, tokenizer, args.max_seq_length)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
        tokenizer.enable_truncation(args.max_seq_length)
        tokenizer.enable_padding('right', max_length=args.max_seq_length)
        checkpoints = [args.output_dir]
        # can specifiy only one checkpoint
        if args.checkpoint is not None:
            checkpoints = [f'{args.data_dir}/checkpoint-{args.checkpoint}']
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            # The original passed global_step positionally after keyword
            # arguments, which is a SyntaxError.
            # NOTE(review): keyword name assumed to match evaluate()'s
            # parameter - confirm against its definition.
            evaluate(args, model, tokenizer, prefix=prefix, set_name='eval', global_step=global_step)

    return results


if __name__ == "__main__":
    main()
def main():
    """Train a BERT sequence classifier, optionally on augmented data.

    Steps, in order:
      1. parse arguments, create the output directory, attach TensorBoard
         and file logging;
      2. load train/eval texts and labels via ``get_examples_and_labels``;
      3. if ``args.augmentation_recipe`` is set, append perturbed examples
         read from a recipe-specific CSV, labelled like their originals;
      4. tokenize both splits to fixed length ``args.max_seq_len``;
      5. train ``BertForSequenceClassification`` with AdamW and a linear
         schedule, evaluating after every epoch, saving the best model and
         stopping early when accuracy stops improving.

    Raises:
        ValueError: for an unknown augmentation recipe, or when the number of
            texts and labels differ in either split.
    """
    start_time = time.time()
    args = parse_args()
    make_directories(args.output_dir)

    # Start Tensorboard and log hyperparams.
    tb_writer = SummaryWriter(args.output_dir)
    tb_writer.add_hparams(vars(args), {})

    file_log_handler = logging.FileHandler(
        os.path.join(args.output_dir, 'log.txt'))
    logger.addHandler(file_log_handler)

    # Get list of text and list of label (integers) from disk.
    train_text, train_label_id_list, eval_text, eval_label_id_list = \
        get_examples_and_labels(args.dataset)

    # Augment training data.
    if (args.augmentation_recipe is not None) and len(
            args.augmentation_recipe):
        import pandas as pd

        # NOTE(review): absolute, machine-specific paths; each recipe is
        # tied to one fixed attack-output file.
        if args.augmentation_recipe == 'textfooler':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv'
        elif args.augmentation_recipe == 'tf-adjusted':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv'
        else:
            raise ValueError(
                f'Unknown augmentation recipe {args.augmentation_recipe}')

        aug_df = pd.read_csv(aug_csv)

        # filter skipped outputs
        aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']]

        print(
            f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num}/ len {len(aug_df)}'
        )

        original_text = aug_df['original_text']
        perturbed_text = aug_df['perturbed_text']

        # convert `train_text` and `train_label_id_list` to an np array so things are faster
        train_text = np.array(train_text)
        train_label_id_list = np.array(train_label_id_list)

        x_adv_list = []
        x_adv_id_list = []
        for (x, x_adv) in zip(original_text, perturbed_text):
            # strip the [[ ]] markers from both texts
            x = x.replace('[[', '').replace(']]', '')
            x_adv = x_adv.replace('[[', '').replace(']]', '')
            # the perturbed text takes the label of the first training
            # example equal to its original (IndexError if there is none)
            x_idx = (train_text == x).nonzero()[0][0]
            x_adv_label = train_label_id_list[x_idx]
            x_adv_id_list.append(x_adv_label)
            x_adv_list.append(x_adv)

        # truncate to `args.augmentation_num` examples
        if (args.augmentation_num >= 0):
            perm = list(range(len(x_adv_list)))
            random.shuffle(perm)
            perm = perm[:args.augmentation_num]
            x_adv_list = [x_adv_list[i] for i in perm]
            x_adv_id_list = [x_adv_id_list[i] for i in perm]

        train_text = train_text.tolist() + x_adv_list
        train_label_id_list = train_label_id_list.tolist() + x_adv_id_list

        print(
            f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}'
        )

    num_labels = len(set(train_label_id_list))
    logger.info('num_labels: %s', num_labels)

    train_examples_len = len(train_text)

    if len(train_label_id_list) != train_examples_len:
        raise ValueError(
            f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})'
        )
    if len(eval_label_id_list) != len(eval_text):
        raise ValueError(
            f'Number of teste xamples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})'
        )

    print_cuda_memory(args)
    # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s

    # @TODO support other vocabularies, or at least, support case
    tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt',
                                       lowercase=True)
    tokenizer.enable_padding(max_length=args.max_seq_len)
    tokenizer.enable_truncation(max_length=args.max_seq_len)

    logger.info(f'Tokenizing training data. (len: {train_examples_len})')
    train_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(train_text)
    ]
    logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})')
    eval_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(eval_text)
    ]
    load_time = time.time()
    logger.info(f'Loaded data and tokenized in {load_time-start_time}s')

    print_cuda_memory(args)

    # Load pre-trained model tokenizer (vocabulary)
    logger.info('Loading model: %s', args.model_dir)
    # Load pre-trained model (weights)
    logger.info(f'Model class: (vanilla) BertForSequenceClassification.')
    model = BertForSequenceClassification.from_pretrained(
        args.model_dir, num_labels=num_labels)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # `device` is not defined in this function; it is resolved from the
    # enclosing module scope.
    model.to(device)
    # print(model)

    # multi-gpu training
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
    logger.info(f'Training model across {args.num_gpus} GPUs')

    num_train_optimization_steps = int(
        train_examples_len / args.batch_size /
        args.grad_accum_steps) * args.num_train_epochs

    # biases and LayerNorm parameters are excluded from weight decay
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # NOTE(review): `warmup_proportion` is passed straight through as a
    # step count; if it is a fraction this gives almost no warmup. Left
    # unchanged - confirm intent.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_proportion,
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", train_examples_len)
    logger.info(" Batch size = %d", args.batch_size)
    logger.info(" Max sequence length = %d", args.max_seq_len)
    logger.info(" Num steps = %d", num_train_optimization_steps)

    wandb.log({'train_examples_len': train_examples_len})

    train_input_ids = torch.tensor(train_text_ids, dtype=torch.long)
    train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long)
    eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_label_ids)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    def get_eval_acc():
        """Return the fraction of eval examples whose argmax matches the label."""
        correct = 0
        total = 0
        # Evaluate with dropout disabled; the original scored the model
        # while it was still in training mode.
        model.eval()
        for input_ids, label_ids in tqdm.tqdm(eval_dataloader,
                                              desc="Evaluating accuracy"):
            input_ids = input_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids)[0]
            correct += (logits.argmax(dim=1) == label_ids).sum()
            total += len(label_ids)
        model.train()  # back to training mode for the next epoch
        return float(correct) / total

    def save_model():
        """Write the weights and config of the current model to args.output_dir."""
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        logger.info(
            f'Best acc found. Saved tokenizer, model config, and model to {args.output_dir}.'
        )

    global_step = 0

    def save_model_checkpoint(checkpoint_name=None):
        """Save the model and args under args.output_dir/<checkpoint_name>."""
        # Save model checkpoint
        checkpoint_name = checkpoint_name or 'checkpoint-{}'.format(
            global_step)
        output_dir = os.path.join(args.output_dir, checkpoint_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Checkpoint saved to %s.', output_dir)

    print_cuda_memory(args)
    model.train()
    best_eval_acc = 0
    steps_since_best_eval_acc = 0

    def loss_backward(loss):
        """Scale the loss for multi-GPU / accumulation, then backpropagate."""
        if args.num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()

    # built once; the original constructed a new (and a second, unused)
    # CrossEntropyLoss for every batch
    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(prog_bar):
            print_cuda_memory(args)
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch
            logits = model(input_ids)[0]
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar('loss', loss, global_step)
                # The original wrote the loss under the 'lr' tag; log the
                # optimizer's current learning rate instead.
                tb_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                     global_step)
            loss_backward(loss)
            prog_bar.set_description(f"Loss {loss.item()}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                # NOTE(review): global_step is also incremented at the end
                # of every batch below. Nesting reconstructed from
                # flattened source - confirm.
                global_step += 1
            # Save model checkpoint to file.
            if global_step % args.checkpoint_steps == 0:
                save_model_checkpoint()

            # NOTE(review): zeroing gradients on every batch discards what
            # was accumulated when grad_accum_steps > 1. Left unchanged -
            # confirm intent.
            model.zero_grad()

            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        eval_acc = get_eval_acc()
        tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step)
        wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch})

        if args.checkpoint_every_epoch:
            save_model_checkpoint(f'epoch-{epoch}')

        logger.info(f'Eval acc: {eval_acc*100}%')
        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            steps_since_best_eval_acc = 0
            save_model()
        else:
            steps_since_best_eval_acc += 1
            if (args.early_stopping_epochs > 0) and (
                    steps_since_best_eval_acc > args.early_stopping_epochs):
                logger.info(
                    f'Stopping early since it\'s been {args.early_stopping_epochs} steps since validation acc increased'
                )
                break
def main():
    """Fine-tune and/or evaluate DistilBERT on one or all GLUE tasks.

    Parses the command line, prepares the output directory (master rank
    only), seeds the RNGs and builds a fixed-length WordPiece tokenizer.
    Then, for each task (``--task_name``, or every entry of ``GLUE_TASKS``):
    builds the model, optionally trains it and saves config and weights,
    and optionally evaluates on the dev set and dumps the results as JSON.

    Raises:
        ValueError: if the output directory, or a per-task output directory,
            already exists and ``--force`` was not given.
    """
    parser = ArgumentParser('GLUE evaluation example')
    parser.add_argument(
        '--glue_dir',
        type=str,
        metavar='PATH',
        required=True,
        help='Path to directory containing the GLUE tasks data.')
    parser.add_argument(
        '--output_dir',
        type=str,
        metavar='PATH',
        required=True,
        help=
        'Path to the output directory (for logs, checkpoints, parameters, etc.).'
    )
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument(
        '--task_name',
        type=str,
        default=None,
        choices=GLUE_TASKS,
        help='The specific GLUE task to train and/or evaluate on.')
    parser.add_argument('--do_train',
                        action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval',
                        action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file',
                        type=str,
                        metavar='PATH',
                        required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache',
                        action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len',
                        type=int,
                        default=128,
                        metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--do_lower_case',
                        action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n',
                        '--num_epochs',
                        type=int,
                        default=3,
                        metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=2e-5,
                        metavar='F',
                        help='The initial learning rate.')
    parser.add_argument('--epsilon',
                        type=float,
                        default=1e-8,
                        metavar='F',
                        help="Adam's epsilon.")
    parser.add_argument('--warmup_prop',
                        type=float,
                        default=0.05,
                        metavar='F',
                        help='Linear warmup proportion.')
    parser.add_argument(
        '--num_gradient_accumulation_steps',
        type=int,
        default=1,
        metavar='N',
        help=
        'The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm',
                        type=float,
                        default=1.0,
                        metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='N',
                        help='Random seed.')
    parser.add_argument('-c',
                        '--use_cuda',
                        action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument(
        '-d',
        '--use_distributed',
        action='store_true',
        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    # derive effective rank and batch sizes
    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.'
        )
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params (done before Path/device objects are attached, so
        # every value is still JSON-serialisable)
        with open(Path(params.output_dir) / 'params.json', 'w') as f:
            json.dump(vars(params), f, indent=4, sort_keys=True)
    params.glue_dir = Path(params.glue_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(params.tokenizer_vocab_file,
                                       lowercase=params.do_lower_case)

    # enable truncation and padding
    tokenizer.enable_truncation(params.max_sequence_len)
    tokenizer.enable_padding(length=params.max_sequence_len)

    # go over each task
    if params.task_name is not None:
        tasks = [params.task_name]
        output_dirs = [params.output_dir]
    else:
        tasks = GLUE_TASKS
        output_dirs = [
            params.output_dir / task / str(params.seed) for task in tasks
        ]

    for task, task_output_dir in zip(tasks, output_dirs):
        # prepare the GLUE task
        if params.is_master:
            logger.info(f'Preparing the {task} GLUE task')

        # make task_output_dir
        # With --task_name the task directory IS the output directory that
        # was just created above, so the existence check would always fire
        # without --force; skip the check (and tolerate the existing
        # directory) in that case.
        is_root_dir = task_output_dir == params.output_dir
        if not is_root_dir and task_output_dir.is_dir() and not params.force:
            raise ValueError(
                f'Task output directory {task_output_dir} already exists. Use `--force` if you want to overwrite it.'
            )
        if params.is_master:
            task_output_dir.mkdir(parents=True,
                                  exist_ok=params.force or is_root_dir)

        # initialize the model
        if params.is_master:
            logger.info(f'{task} - Initializing the model')
        config = DistilBertConfig.from_pretrained(
            params.config_file,
            num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
            finetuning_task=task)
        model = DistilBertForSequenceClassification.from_pretrained(
            params.weights_file, config=config)

        # send model to device
        model = model.to(params.device)

        # perform the training
        if params.do_train:
            # initialize the training dataset
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataset')
            train_dataset = GLUETaskDataset(
                task=task,
                glue_dir=params.glue_dir,
                split='train',
                tokenizer=tokenizer,
                overwrite_cache=params.overwrite_cache)

            # initialize the sampler
            if params.is_master:
                logger.info(f'{task} - Initializing the training sampler')
            train_sampler = DistributedSampler(
                train_dataset) if params.use_distributed else RandomSampler(
                    train_dataset)

            # initialize the dataloader
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataloader')
            train_dataloader = DataLoader(dataset=train_dataset,
                                          sampler=train_sampler,
                                          batch_size=params.train_batch_size)

            # initialize the optimizer
            if params.is_master:
                logger.info(f'{task} - Initializing the optimizer')
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate,
                                   eps=params.epsilon,
                                   betas=(0.9, 0.98))

            # initialize the learning rate scheduler
            if params.is_master:
                logger.info(
                    f'{task} - Initializing the learning rate scheduler')
            num_steps_epoch = len(train_dataloader)
            num_train_steps = math.ceil(
                num_steps_epoch / params.num_gradient_accumulation_steps *
                params.num_epochs)
            num_warmup_steps = math.ceil(num_train_steps * params.warmup_prop)

            def lr_lambda(current_step):
                # linear warmup from 0 to 1, then linear decay down to 0
                if current_step < num_warmup_steps:
                    return float(current_step) / float(max(
                        1, num_warmup_steps))
                return max(
                    0.0,
                    float(num_train_steps - current_step) /
                    float(max(1, num_train_steps - num_warmup_steps)))

            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                       lr_lambda=lr_lambda,
                                                       last_epoch=-1)

            # initialize distributed data parallel (DDP)
            if params.use_distributed:
                if params.is_master:
                    logger.info('Initializing DDP')
                model = DDP(model,
                            device_ids=[params.local_rank],
                            output_device=params.local_rank)

            # start training
            if params.is_master:
                logger.info(f'{task} - Starting the training')
            train(task=task,
                  model=model,
                  dataloader=train_dataloader,
                  optimizer=optimizer,
                  num_epochs=params.num_epochs,
                  lr_scheduler=lr_scheduler,
                  num_gradient_accumulation_steps=params.
                  num_gradient_accumulation_steps,
                  max_gradient_norm=params.max_gradient_norm,
                  device=params.device,
                  use_distributed=params.use_distributed,
                  is_master=params.is_master,
                  use_tqdm=True,
                  logger=logger)

            # save the finetuned model
            if params.is_master:
                # take care of distributed training
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.config.architectures = [
                    model_to_save.__class__.__name__
                ]
                model_name = model_to_save.__class__.__name__
                config_path = task_output_dir / TRAINED_CONFIG_FILE_TEMPLATE.format(
                    model_name=model_name, task=task)
                weights_path = task_output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                    model_name=model_name, task=task)

                logger.info(f'{task} - Saving the finetuned model config')
                with open(config_path, mode='w') as f:
                    json.dump(vars(model_to_save.config),
                              f,
                              indent=4,
                              sort_keys=True)

                logger.info(f'{task} - Saving the finetuned model weights')
                torch.save(model_to_save.state_dict(), weights_path)

            # reload the model
            # Restricted to the master rank: it is the only rank that
            # evaluates, and the only one on which `model_to_save` (and the
            # saved files) are guaranteed to exist; other ranks previously
            # hit a NameError here.
            if params.do_eval and params.is_master:
                logger.info(f'{task} - Reloading the model')
                config = DistilBertConfig.from_pretrained(
                    str(config_path),
                    num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
                    finetuning_task=task)
                model = DistilBertForSequenceClassification.from_pretrained(
                    str(weights_path), config=config)
                model = model.to(params.device)

        # perform the evaluation
        if params.do_eval and params.is_master:
            # initialize the evaluation dataset
            logger.info(f'{task} - Initializing the evaluation dataset')
            eval_datasets = [
                GLUETaskDataset(task=task,
                                glue_dir=params.glue_dir,
                                split='dev',
                                tokenizer=tokenizer,
                                overwrite_cache=params.overwrite_cache)
            ]

            # hot fix for MNLI-MM
            if task == 'MNLI':
                eval_datasets.append(
                    GLUETaskDataset(task='MNLI-MM',
                                    glue_dir=params.glue_dir,
                                    split='dev',
                                    tokenizer=tokenizer))

            for eval_dataset in eval_datasets:
                # initialize the sampler
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation sampler'
                )
                eval_sampler = SequentialSampler(eval_dataset)

                # initialize the dataloader
                logger.info(
                    f'{eval_dataset.task} - Initializing the evaluation dataloader'
                )
                eval_dataloader = DataLoader(dataset=eval_dataset,
                                             sampler=eval_sampler,
                                             batch_size=params.eval_batch_size)

                # start evaluating
                logger.info(f'{eval_dataset.task} - Starting the evaluation')
                results = evaluate(task=task,
                                   model=model,
                                   dataloader=eval_dataloader,
                                   device=params.device,
                                   use_tqdm=True)

                # log results
                logger.info(f'{eval_dataset.task} - Evaluation results:')
                for key, result in results.items():
                    logger.info(f'{eval_dataset.task} - {key}: {result}')

                # dump results
                results_path = task_output_dir / RESULTS_FILE_TEMPLATE.format(
                    model_name=model.__class__.__name__,
                    task=eval_dataset.task)
                with open(results_path, 'w') as f:
                    json.dump(results, f, indent=4)

        if params.is_master:
            logger.info(f'Done with the {task} GLUE task')
from google.cloud import storage import tokenizers from transformers import BertTokenizer from tokenizers import BertWordPieceTokenizer from torch.utils.data import Dataset, TensorDataset, DataLoader from torch.utils.data.sampler import RandomSampler import numpy as np import random import jieba import logging logging.getLogger("jieba").setLevel(logging.WARNING) tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt') tokenizer.add_special_tokens(["<nl>"]) tokenizer.enable_truncation(max_length=512) tokenizer.enable_padding(length=512) client = storage.Client() blobs = [] size = 0 for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'): if (blob.name.endswith('.txt')): blobs.append(blob) sub_blobs = random.sample(blobs, 5) def iterator_gen(generator, handler=None, parallel=False): try: import gc import multiprocessing as multiprocessing if parallel:
# tokenizer.save('./', 'token_test')
# else:
# tokenizer = ByteLevelBPETokenizer( "./{}-vocab.json".format('token_test'), "./{}-merges.txt".format('token_test'),
# add_prefix_space=True,
# )
# # Now we can encode
# encoded = tokenizer.encode("will be back later. http://plurk.com/p/rp3k7,will be back later, loooove u @mahboi #blessed")
# print(encoded.tokens)
# print(encoded.offsets)

from tokenizers import BertWordPieceTokenizer

# Sample input for the demo
sentence = "[CLS] will be back later. www.facebook.com ,will be back later, loooove u @mahboi #blessed"

# Build a lowercasing, text-cleaning WordPiece tokenizer from the BERT
# vocabulary file, register an extra '[LINK]' token and turn on padding.
tokenizer = BertWordPieceTokenizer(
    "bert-large-uncased-vocab.txt",
    lowercase=True,
    clean_text=True,
)
tokenizer.add_tokens(['[LINK]'])
tokenizer.enable_padding(max_length=100)

WordPieceEncoder = tokenizer.encode(sentence)

# Show the ids, then the tokens, then the offsets of the encoding
for _field in ("ids", "tokens", "offsets"):
    print(getattr(WordPieceEncoder, _field))

# Vocabulary id of the padding token, then the ids decoded back to text
_vocab = tokenizer.get_vocab()
print(_vocab['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
class Tweets(Dataset):
    """In-memory dataset of tweets encoded with a BERT WordPiece tokenizer.

    Every sample is a dict. Test samples hold ``tid``, ``sentiment``,
    ``tweet`` (token ids with a sentiment token id prepended), ``offsets``,
    ``raw_tweet`` and ``pos`` (one POS-tag id per token). Training samples
    additionally hold ``selection`` (per-token membership in the selected
    text: 1 inside, 0 outside, -1 for special tokens), ``raw_selection`` and
    the inclusive ``start``/``end`` token indices of the selection.

    Args:
        device: Not used by this class; kept so existing callers keep working.
        pad: Target sequence length. The tokenizer pads to ``pad - 1`` and
            one sentiment token is prepended to every sample.
        test: When ``True`` read ``TEST_PATH`` (rows of id, tweet,
            sentiment); otherwise read ``TRAIN_PATH`` (rows of id, tweet,
            selection, sentiment).
        N: Forwarded to ``augment_n`` for the training data when positive.
    """

    # Token ids that never carry selection or POS information.
    # NOTE(review): presumably [PAD], [CLS] and [SEP] of the
    # bert-base-uncased vocabulary - confirm against the vocab file.
    _SPECIAL_IDS = (0, 101, 102)

    def __init__(self, device='cpu', pad=150, test=False, N=4):
        self.samples = []
        self.pad = pad
        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)
        # -1 leaves room for the sentiment token prepended to each sample.
        self.tokenizer.enable_padding(max_length=pad - 1)
        self.tokenizer.add_special_tokens(['[POS]'])
        self.tokenizer.add_special_tokens(['[NEG]'])
        self.tokenizer.add_special_tokens(['[NEU]'])
        self.vocab = self.tokenizer.get_vocab()
        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }

        # Map every tag of the tagset to a positive id; 0 is reserved.
        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()
        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1
        self.tweet_tokenizer = TweetTokenizer()

        if test is True:
            for row in pd.read_csv(TEST_PATH).values:
                tid, tweet, sentiment = tuple(row)
                self.samples.append(
                    self._build_test_sample(tid, tweet, sentiment))
        else:
            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)
            for row in data:
                tid, tweet, selection, sentiment = tuple(row)
                sample = self._build_train_sample(tid, tweet, selection,
                                                  sentiment)
                # None means no token overlapped the selection; the row has
                # already been reported and is skipped.
                if sample is not None:
                    self.samples.append(sample)

    @staticmethod
    def _char_pos_membership(tweet, pos_tokens, tagged, pos_set):
        """Return one POS-tag id per character of ``tweet`` (0 = untagged).

        ``pos_tokens`` are located in ``tweet`` left to right; ``tagged`` is
        the matching sequence of ``(token, tag)`` pairs.
        """
        membership = [0] * len(tweet)
        cursor = 0
        for token, (_, tag) in zip(pos_tokens, tagged):
            start = tweet.find(token, cursor)
            if start < 0:
                # The token is not a literal substring from the cursor on.
                # Writing through a -1 index would insert elements and
                # change the list length, so leave the characters untagged.
                continue
            end = start + len(token)
            if tag in pos_set:
                membership[start:end] = [pos_set[tag]] * len(token)
            # Resume after this match; advancing by the token length alone
            # lags behind skipped whitespace and re-matches earlier text.
            cursor = end
        return membership

    @classmethod
    def _token_pos(cls, ids, offsets, pos_membership):
        """Return, per token, the most frequent POS id among its characters."""
        token_pos = [0] * len(ids)
        for i, (s, e) in enumerate(offsets):
            if ids[i] in cls._SPECIAL_IDS or s == e:
                continue
            span = pos_membership[s:e]
            if not span:
                # Offsets outside the tweet: keep the default id of 0.
                continue
            token_pos[i] = max(set(span), key=span.count)
        return token_pos

    @classmethod
    def _selection_membership(cls, ids, offsets, char_membership):
        """Project the character-level selection mask onto the tokens.

        Returns ``(token_membership, start, end)``. ``start`` and ``end``
        are inclusive and already shifted by one for the sentiment token
        prepended later; both are ``None`` when no token overlaps.
        """
        token_membership = [0] * len(ids)
        start = None
        end = None
        for i, (s, e) in enumerate(offsets):
            if ids[i] in cls._SPECIAL_IDS:
                token_membership[i] = -1
            elif sum(char_membership[s:e]) > 0:
                token_membership[i] = 1
                if start is None:
                    start = i + 1
                end = i + 1
        return token_membership, start, end

    def _pos_and_encoding(self, tweet):
        """Return ``(pos_membership, ids, offsets)`` for one tweet."""
        pos_tokens = self.tweet_tokenizer.tokenize(tweet)
        tagged = nltk.pos_tag(pos_tokens)
        pos_membership = self._char_pos_membership(tweet, pos_tokens, tagged,
                                                   self.pos_set)
        encoding = self.tokenizer.encode(tweet)
        return pos_membership, encoding.ids, encoding.offsets

    def _build_test_sample(self, tid, tweet, sentiment):
        """Build the sample dict for one row of the test file."""
        pos_membership, ids, offsets = self._pos_and_encoding(tweet)
        token_pos = [0] + self._token_pos(ids, offsets, pos_membership)
        ids = [self.sent_t[sentiment]] + ids
        offsets = [(0, 0)] + offsets
        return {
            'tid': tid,
            'sentiment': sentiment,
            'tweet': np.array(ids),
            'offsets': np.array([[off[0], off[1]] for off in offsets]),
            'raw_tweet': tweet,
            'pos': np.array(token_pos)
        }

    def _build_train_sample(self, tid, tweet, selection, sentiment):
        """Build the sample dict for one training row.

        Returns ``None`` (after printing the offending row) when no token
        overlaps the selected text.

        Raises:
            Exception: If one of the row fields is ``None``.
        """
        # Mark the characters of the selection; if the selection is not a
        # substring of the tweet the whole tweet counts as selected.
        si = tweet.find(selection)
        if si < 0:
            char_membership = [1] * len(tweet)
        else:
            char_membership = [0] * len(tweet)
            char_membership[si:si + len(selection)] = [1] * len(selection)

        pos_membership, ids, offsets = self._pos_and_encoding(tweet)
        token_membership, start, end = self._selection_membership(
            ids, offsets, char_membership)
        token_pos = self._token_pos(ids, offsets, pos_membership)

        if start is None:
            print("Data Point Error")
            print(tweet)
            print(selection)
            return None

        # Only the row fields can be None here; everything else was just
        # built above.
        for field in (tid, sentiment, selection, tweet):
            if field is None:
                raise Exception('None field detected')

        ids = [self.sent_t[sentiment]] + ids
        token_membership = [-1] + token_membership
        offsets = [(0, 0)] + offsets
        token_pos = [0] + token_pos
        return {
            'tid': tid,
            'sentiment': sentiment,
            'tweet': np.array(ids),
            'selection': np.array(token_membership).astype('float'),
            'raw_selection': selection,
            'raw_tweet': tweet,
            'start': start,
            'end': end,
            'offsets': np.array([[off[0], off[1]] for off in offsets]),
            'pos': np.array(token_pos)
        }

    def get_splits(self, val_size=.3):
        """Return shuffled ``(train_indices, valid_indices)`` index arrays.

        ``val_size`` is the fraction of samples assigned to validation.
        """
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        return indices[:split], indices[split:]

    def k_folds(self, k=5):
        """Return a list of ``k`` shuffled index arrays covering all samples."""
        indices = np.random.permutation(len(self.samples))
        return np.array_split(indices, k)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return one sample, or a list of samples for an iterable of indices."""
        try:
            return self.samples[idx]
        except TypeError:
            # Not a valid list index (e.g. a list or array of indices).
            return [self.samples[i] for i in idx]