def __init__(self, t: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at %s", file_path)

    # -------------------------- CHANGES START
    bert_tokenizer = os.path.join(args.tokenizer_name, "vocab.txt")
    if os.path.exists(bert_tokenizer):
        logger.info("Loading BERT tokenizer")
        from tokenizers import BertWordPieceTokenizer
        tokenizer = BertWordPieceTokenizer(
            os.path.join(args.tokenizer_name, "vocab.txt"),
            handle_chinese_chars=False,
            lowercase=False,
        )
        tokenizer.enable_truncation(512)
    else:
        from tokenizers import ByteLevelBPETokenizer
        from tokenizers.processors import BertProcessing

        logger.info("Loading RoBERTa tokenizer")
        tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

    logger.info("Reading file %s", file_path)
    with open(file_path, encoding="utf-8") as f:
        lines = [line for line in f.read().splitlines()
                 if (len(line) > 0 and not line.isspace())]

    logger.info("Running tokenization")
    self.examples = tokenizer.encode_batch(lines)
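A quick way to sanity-check the truncation set up above (a minimal sketch, assuming a local vocab.txt): every Encoding returned by encode_batch is capped at the configured length, specials included.

# --- sketch: verify enable_truncation caps every encoding (assumes a local vocab.txt) ---
from tokenizers import BertWordPieceTokenizer

tok = BertWordPieceTokenizer("vocab.txt", handle_chinese_chars=False, lowercase=False)
tok.enable_truncation(512)
encodings = tok.encode_batch(["short line", "another line " * 500])
assert all(len(enc.ids) <= 512 for enc in encodings)  # [CLS]/[SEP] count toward the limit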
def train_tokenizer(captions):
    print('Create training file...')
    # flatten the nested list of captions (renamed so it no longer shadows this function)
    train_samples = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'a') as f:
        for sample in train_samples:
            # samples are assumed to already end with a newline
            f.write(sample)

    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')
    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])
    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
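A hedged usage sketch for the function above: `captions` is a list of lists of strings, flattened before training; a real corpus would be far larger than this toy input.

# --- usage sketch (toy corpus; real training data would be much larger) ---
captions = [["a dog runs through the park\n", "a dog plays fetch\n"],
            ["a cat sleeps on the couch\n"]]
tok = train_tokenizer(captions)
print(tok.encode("a dog runs through the park").tokens)  # truncated to 16 tokens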
def tokenize_and_cache_data(data_dir, output_dir,
                            tokenizer=None, tokenizer_path=None,
                            n_sentences=0, use_overflow=False,
                            two_segments=True, delete_existing=False,
                            max_length=512):
    if not tokenizer:
        tokenizer = BertWordPieceTokenizer(tokenizer_path)
    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(max_length=max_length)

    num_tokens = 0
    num_examples = 0

    if delete_existing:
        rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    pbar = tqdm(os.listdir(data_dir))
    for path in pbar:
        result = process_one_file(data_dir, path, tokenizer, output_dir,
                                  n_sentences, use_overflow, two_segments)
        num_examples += result['num_examples']
        num_tokens += result['num_tokens']
        pbar.set_description(
            f"{num_tokens} tokens, {num_examples} examples, "
            f"{num_tokens / (num_examples * max_length):.3f} non-pad token fraction"
        )
def get_transformer_tokenizer(vocab_path, max_tokens, device="cpu"):
    """
    Return a tokenizer to be used with Transformer-based models
    """
    wp_tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    wp_tokenizer.enable_padding(direction="right", pad_type_id=1)
    wp_tokenizer.enable_truncation(max_tokens)
    return TransformerSquadTokenizer(wp_tokenizer, device=device)
def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
    print(file_path)
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, args.bert_model_type + "_cached_mlm_" + filename)

    if os.path.exists(cached_features_file):
        print("Loading features from cached file %s" % cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.samples = torch.load(handle)
    else:
        print("Creating features from dataset file at %s" % directory)

        # Get the faster tokenizer from the tokenizers package
        tokenizer.save_vocabulary(vocab_path='.')
        fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=args.lowercase)
        fast_tokenizer.enable_truncation(tokenizer.max_len)
        fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                      pad_token=tokenizer.pad_token)
        self.samples = []

        # Load data over here
        df = pd.read_json(file_path)
        print('SQUAD data: ')
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            for paragraph in row['data']['paragraphs']:
                context = paragraph['context']
                for qa_pair in paragraph['qas']:
                    question = qa_pair['question']
                    batch = fast_tokenizer.encode(question, context)
                    self.samples.append({
                        'input_ids': batch.ids,
                        'attention_mask': batch.attention_mask
                    })
                    for encoding in batch.overflowing:
                        self.samples.append({
                            'input_ids': encoding.ids,
                            'attention_mask': encoding.attention_mask
                        })
        df = None

        print("Saving features into cached file:", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            torch.save(self.samples, handle,
                       pickle_protocol=pickle.HIGHEST_PROTOCOL)
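The `.overflowing` list used above holds the truncated-away remainder as extra encodings; a `stride` on `enable_truncation` makes consecutive windows share context. A minimal single-sequence sketch, assuming a local vocab.txt:

# --- sketch: overflow windows produced by truncation; stride sets their overlap ---
fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)
fast_tokenizer.enable_truncation(max_length=64, stride=16)
enc = fast_tokenizer.encode("a very long passage " * 100)
print(len(enc.ids))          # 64
print(len(enc.overflowing))  # remaining windows, each reusing 16 tokens of context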
def tokenize(texts: pd.Series, tokenizer: BertWordPieceTokenizer,
             chunk_size: int = 240, maxlen: int = 512) -> np.ndarray:
    '''Tokenize the input texts and return the token ids as an array'''
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        tokenizer.enable_padding(max_length=maxlen)
    except TypeError:
        # newer versions of tokenizers renamed the keyword to `length`
        tokenizer.enable_padding(length=maxlen)

    all_ids = []
    for i in range(0, len(texts), chunk_size):
        text_chunk = texts[i:i + chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    return np.array(all_ids)
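Usage sketch for `tokenize` (assumes a local vocab.txt); every row comes back padded or truncated to `maxlen`, so the result is a rectangular array:

# --- usage sketch ---
import pandas as pd
from tokenizers import BertWordPieceTokenizer

texts = pd.Series(["first comment", "second, much longer comment"])
ids = tokenize(texts, BertWordPieceTokenizer("vocab.txt", lowercase=True))
print(ids.shape)  # (2, 512)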
class BERT16SDataset(Dataset):
    """
    A torch dataset class designed to load 16S data found in a tsv file and encode it for BERT.

    :param vocab_path: str, path to the pre-trained BERT tokenizer vocab file.
    :param data_path: str, path to the 16S data file.
    :param block_size: int, maximal BERT input length (an encoded sample will be padded to this length if too short)
    :param max_word_length: int, the maximal word length the tokenizer can encode.
    """

    def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):
        assert os.path.isfile(data_path)
        assert os.path.isfile(vocab_path)

        _logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
        self.tokenizer = BertWordPieceTokenizer(
            vocab_path, handle_chinese_chars=False, lowercase=False)
        self.tokenizer.enable_truncation(block_size)
        self.tokenizer.enable_padding(max_length=block_size)

        _logger.info(f"Loading 16S dataset file at {data_path}...")
        self._16s_corpus_df = pd.read_csv(data_path, sep='\t')
        _logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")
        self.samples = self._16s_corpus_df.seq.values.tolist()
        self.max_word_length = max_word_length

    def __len__(self):
        return len(self._16s_corpus_df)

    def __getitem__(self, i):
        sample = self._split_sequence_by_max_word_length(self.samples[i])
        tokens = self.tokenizer.encode(sample)
        return torch.tensor(tokens.ids, dtype=torch.long)

    def _split_sequence_by_max_word_length(self, seq):
        """
        Split a 16S sequence (~1K long usually) into whitespace-separated chunks
        that the tokenizer can encode.

        :param seq: str, 16S sequence
        :return: str
        """
        chunks = [seq[i: i + self.max_word_length]
                  for i in range(0, len(seq), self.max_word_length)]
        return ' '.join(chunks)
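Because every item is padded to `block_size`, the default collate function can stack the tensors directly; a usage sketch with hypothetical file paths:

# --- usage sketch (hypothetical paths) ---
from torch.utils.data import DataLoader

dataset = BERT16SDataset("vocab.txt", "16s_corpus.tsv", block_size=512)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
batch = next(iter(loader))
print(batch.shape)  # torch.Size([8, 512])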
def get_preds(list_of_texts):
    transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'))
    model = build_model(transformer_layer, max_len=MAX_LEN)
    model.load_weights('model/weights')
    # model = tf.keras.models.load_model('model')
    print('weights loaded')

    tokenizer = transformers.DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased')
    tokenizer.save_pretrained('.')
    # Reload it with the huggingface tokenizers library
    fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
    fast_tokenizer.enable_truncation(max_length=MAX_LEN)
    fast_tokenizer.enable_padding(length=MAX_LEN)

    all_ids = []
    encs = fast_tokenizer.encode_batch(list_of_texts)
    all_ids.extend([enc.ids for enc in encs])
    all_ids = np.array(all_ids).astype(np.float32)

    to_predict = create_test(all_ids)
    predictions = model.predict(to_predict)
    # print(predictions * 10)
    for prediction in predictions:
        print(prediction)

    dic = {'predictions': predictions}
    parsed = []
    # response = pd.DataFrame(dic)
    # parsed = response.to_json(orient='columns')  # not sure if works
    # json.dumps(parsed)  # to be reviewed
    return parsed, predictions
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer,
        using the Hugging Face tokenizers library.
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()
        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences.
        """
        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # Hugging Face's trainer only accepts files, so write the
            # sentences to a temporary training file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                [f.write(i + "\n") for i in sentences]
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(
                self.tokenizer.get_vocab_size()))
            # Remove the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
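A usage sketch for the class above; note that its `decode` delegates to `decode_batch`, so it expects a list of id sequences rather than a single one:

# --- usage sketch ---
tok = Tokenizer("en")
tok.train_tokenizer(["hello world", "how are you"])
enc = tok.encode("hello world")
print(enc.tokens)
print(tok.decode([enc.ids]))  # list-of-sequences in, list-of-strings out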
class SentimentModel:
    def __init__(self, model_dir):
        # load session and graph
        self.sess = tf.Session(graph=tf.Graph())
        tf.saved_model.loader.load(self.sess, ['serve'], export_dir=model_dir)
        self.tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, 'vocab.txt'))
        self.tokenizer.enable_truncation(max_length=MAX_LEN)

    def predict(self, text):
        tokenized = self.tokenizer.encode(text)
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        # placeholders
        input_token = self.sess.graph.get_tensor_by_name('Input-Token:0')
        input_segment = self.sess.graph.get_tensor_by_name('Input-Segment:0')
        output = self.sess.graph.get_tensor_by_name('label/Softmax:0')
        probas = self.sess.run([output],
                               feed_dict={input_token: token_ids,
                                          input_segment: segment_ids})
        return tuple(probas[0][0].tolist())
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it.
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # `lowercase` must be passed to the constructor; assigning
    # `tokenizer.do_lower_case` after construction has no effect
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize,
                    min_frequency=min_freq, special_tokens=special_tokens)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
    )
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
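Note the argument order: `BertProcessing` takes the ([SEP], id) pair first and the ([CLS], id) pair second. A hedged usage sketch with a hypothetical corpus file and params dict:

# --- usage sketch (hypothetical corpus and params) ---
params = {"tokenizer_path": "tok_out", "max_length": 128,
          "min_freq": 2, "vocab_size": 8000}
train_tokenizer("corpus.txt", params)
# encodings produced by the saved tokenizer come back as: [CLS] ... [SEP]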
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--num_eval_docs", default=1000, type=int,
                        help="Number of docs per query in eval set.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--msmarco_output", action='store_true',
                        help="Return msmarco output format file")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.n_gpu = 1  # force single-GPU training
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    num_labels = 2
    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=num_labels)
    tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
    tokenizer.enable_truncation(args.max_seq_length)
    tokenizer.enable_padding('right', max_length=args.max_seq_length)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    model.to(args.device)
    args.output_mode = 'classification'

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        dataset_path = f'{args.data_dir}/triples.unique.eq.train.small.csv'
        train_dataset = LazyTextDataset(dataset_path, tokenizer, args.max_seq_length)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = BertWordPieceTokenizer(f"{args.data_dir}/bert_based_uncased_vocab.txt", lowercase=True)
        tokenizer.enable_truncation(args.max_seq_length)
        tokenizer.enable_padding('right', max_length=args.max_seq_length)
        checkpoints = [args.output_dir]
        # can specify only one checkpoint
        # NOTE: args.checkpoint is not defined in the parser above; it must be supplied elsewhere
        checkpoints = [f'{args.data_dir}/checkpoint-{args.checkpoint}']
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix=prefix, set_name='eval', global_step=global_step)

    return results


if __name__ == "__main__":
    main()
# Download the pre-trained PT-BR checkpoint
if not os.path.exists('bert-base-portuguese-cased_pytorch_checkpoint.zip'):
    wget.download("https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip")
!unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-portuguese

# Create the tokenizer from a vocabulary file
# lowercase=False (inputs are NOT lowercased; the original casing is kept)
# strip_accents=False (accents are kept)
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False, strip_accents=False)

# Show the tokenizer's info
print(tokenizer)

# Enable truncation and padding
tokenizer.enable_truncation(max_length=60)
tokenizer.enable_padding()

# Tokenize all sentences in one batch
# `sentencas` is a numpy array, so .tolist() converts it to a list first
output = tokenizer.encode_batch(sentencas.tolist())

# The tokenizer returns a list of Encoding objects; each one exposes the
# attributes `ids`, `tokens` and `attention_mask`, so loop over them to
# collect the ids and attention masks into lists
ids = [x.ids for x in output]
attention_mask = [x.attention_mask for x in output]
print(len(ids))
class Vocab:
    """Regular vocabulary for holding the conversations and number of words."""

    DEFAULT_CONTEXT = 'default'

    def __init__(self, max_seq_len: int, conversation_depth: int = 4):
        self.words = {}
        self._context = Vocab.DEFAULT_CONTEXT
        self.conversations = {}
        self._held_conversations = {}
        self.conversation_depth = conversation_depth
        self.longest = 0
        self.longest_tokenized = 0
        self.tokenizer = BertWordPieceTokenizer(
            'data/bert-base-uncased-vocab.txt', lowercase=True)
        self.tokenizer.enable_truncation(max_seq_len)

    def add_word(self, word: str) -> None:
        word = word.lower()
        if word not in self.words:
            self.words[word] = 0
        self.words[word] += 1

    def add_sentence(self, sentence: str) -> None:
        [self.add_word(s) for s in sentence.split()]

    def switch_context(self, new_context: str) -> None:
        if self._context in self._held_conversations and len(
                self._held_conversations[self._context]) > self.conversation_depth:
            self.conversations[self._context].append(
                self._held_conversations[self._context][-self.conversation_depth:][::-1])
        self._context = new_context

    def add_conversation(self, conversation: Dict[str, object]) -> None:
        if self._context not in self.conversations:
            self.conversations[self._context] = []
        self.add_line(conversation)
        lc = self._held_conversations[self._context][-1]
        line = lc['line'].split()
        if len(line) > self.longest:
            self.longest = len(line)
        tokenized = self.tokenizer.encode(lc['line'])
        if len(tokenized.ids) > self.longest_tokenized:
            self.longest_tokenized = len(tokenized.ids)

    def add_line(self, conversation: Dict[str, object]) -> bool:
        if self._context not in self._held_conversations or len(
                self._held_conversations[self._context]) == 0:
            self._held_conversations[self._context] = [conversation]
            return True
        hc = self._held_conversations[self._context]  # Held conversation
        lc = hc[-1]  # Last conversation
        same_speaker = len(lc['speaker']) > 0 and lc[
            'speaker'] == conversation['speaker'] and lc['speaker'] != 'NTP'
        continuing_line = (len(lc['speaker']) == 0 or lc['speaker'] == 'NTP') and \
            (len(conversation['speaker']) == 0 or conversation['speaker'] == 'NTP') \
            and len(conversation['line']) > 0 and conversation['line'][0].islower()
        if same_speaker or continuing_line and conversation['when'] - lc[
                'when'] < 1000 * 60 * 1.5:
            hc[-1]['when'] = conversation['when']
            hc[-1]['line'] += f" {conversation['line']}"
            return False
        if len(self._held_conversations[self._context]) >= 2:
            if self.conversation_depth > 2:
                self.conversations[self._context].append(hc[-2:][::-1])
            self.conversations[self._context].append(
                hc[-min(self.conversation_depth, len(hc)):][::-1])
        hc.append(conversation)
        return True

    def get_tokenizer(self) -> BaseTokenizer:
        return self.tokenizer
import torch
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import jieba
import logging

logging.getLogger("jieba").setLevel(logging.WARNING)

tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)

client = storage.Client()
blobs = []
size = 0
for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'):
    if blob.name.endswith('.txt'):
        blobs.append(blob)
sub_blobs = random.sample(blobs, 5)


def iterator_gen(generator, handler=None, parallel=False):
    try:
        import gc
        import multiprocessing as multiprocessing
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=6e-6), loss='binary_crossentropy',
                  metrics=['accuracy', 'AUC'])
    return model


transformer_layer = (transformers.TFDistilBertModel.from_pretrained(
    'distilbert-base-multilingual-cased'))
model = build_model(transformer_layer, max_len=MAX_LEN)
model.load_weights('/home/aziz/vneuron/model/weights')

fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer.enable_truncation(max_length=MAX_LEN)
fast_tokenizer.enable_padding(length=MAX_LEN)

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    text = request.form['content']
    text = [str(text)]
def tokenizer(self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
    pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
    if self._tokenizer is not None:
        return self._tokenizer
    ### get pickled tokenizer
    if os.path.exists(pkl_path) and not self.retrain_tokenizer:
        with open(pkl_path, 'rb') as f:
            tokenizer = pickle.load(f)
    ### train new tokenizer
    else:
        self.retrain_tokenizer = False
        if self.algorithm == 'bert':
            from tokenizers import BertWordPieceTokenizer
            tokenizer = BertWordPieceTokenizer(
                vocab_file=None if self._init_vocabulary is None else
                os.path.join(self.cache_path, "bert_vocab.txt"))
            tokenizer.enable_truncation(max_length=self.max_length)
            tokenizer.enable_padding(length=self.max_length)
            # train the tokenizer
            if self._init_vocabulary is None:
                path = os.path.join(self.cache_path, 'train.txt')
                with open(path, 'w') as f:
                    for i in chain(self.train_text, self.valid_text, self.test_text):
                        if len(i) == 0:
                            continue
                        f.write(i + "\n" if i[-1] != "\n" else i)
                tokenizer.train(files=path,
                                vocab_size=self.vocab_size,
                                min_frequency=self.min_frequency,
                                limit_alphabet=self.limit_alphabet,
                                show_progress=True)
                tokenizer.save_model(self.tokenizer_path)
        elif self.algorithm in ('count', 'tf', 'tfidf'):
            if self.algorithm == 'count':
                tokenizer = CountVectorizer(input='content',
                                            ngram_range=self.ngram_range,
                                            min_df=self.min_frequency,
                                            max_df=self.max_frequency,
                                            max_features=self.vocab_size,
                                            vocabulary=self._init_vocabulary,
                                            tokenizer=_simple_tokenizer,
                                            stop_words='english')
            elif self.algorithm in ('tf', 'tfidf'):
                tokenizer = TfidfVectorizer(
                    input='content',
                    ngram_range=self.ngram_range,
                    min_df=self.min_frequency,
                    max_df=self.max_frequency,
                    max_features=self.vocab_size,
                    stop_words='english',
                    vocabulary=self._init_vocabulary,
                    tokenizer=_simple_tokenizer,
                    use_idf=False if self.algorithm == 'tf' else True)
            tokenizer.fit(_simple_preprocess(i) for i in chain(
                self.train_text, self.valid_text, self.test_text))
        else:
            raise NotImplementedError
        # save the pickled model
        with open(pkl_path, "wb") as f:
            pickle.dump(tokenizer, f)
    ### assign and return
    self._tokenizer = tokenizer
    return self._tokenizer
import onnxruntime as ort
from tokenizers import BertWordPieceTokenizer

# Helper scripts
from .PreprocessData import normalize_text, truncate_text
from .Predict import get_ids_and_masks, predict

# Initialize ONNX runtime and language model tokenizer
vocab_file_path = os.path.join(os.path.dirname(__file__), "Model/bert-base-uncased-vocab.txt")
onnx_file_path = os.path.join(os.path.dirname(__file__), "Model/watchdog_model.onnx")

tokenizer = BertWordPieceTokenizer(vocab_file_path)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128)
tokenizer.enable_truncation(max_length=128)
ort_session = ort.InferenceSession(onnx_file_path)


def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Invoked TextQualityWatchdog Skill.')

    try:
        body = json.dumps(req.get_json())
        if body:
            logging.info(body)
            values = json.loads(body)['values']
            results = {}
            results["values"] = []
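A hedged sketch of feeding one encoding through the session; the ONNX input names and dtypes below are assumptions (the exported model's real ones can be read from ort_session.get_inputs()):

# --- sketch: score one text (input names/dtypes are assumptions) ---
import numpy as np

enc = tokenizer.encode("some text to score")  # already padded/truncated to 128
ort_inputs = {
    "input_ids": np.array([enc.ids], dtype=np.int64),
    "attention_mask": np.array([enc.attention_mask], dtype=np.int64),
}
outputs = ort_session.run(None, ort_inputs)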
def train_tokenizer(captions):
    # initialize the tokenizer from the generated vocabulary file
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    return tokenizer
class Vocab:
    """Regular vocabulary for holding the conversations and number of words."""

    DEFAULT_CONTEXT = 'default'
    SENTENCE_CUTOFF_DURATION = 1000 * 6

    def __init__(self, max_seq_len: int, conversation_depth: int = 4):
        self.words = {}
        self._context = Vocab.DEFAULT_CONTEXT
        self.conversations = {}
        self._held_conversations = {}
        self.conversation_depth = conversation_depth
        self.longest = 0
        self.longest_tokenized = 0
        self.tokenizer = BertWordPieceTokenizer(
            'data/bert-base-uncased-vocab.txt', lowercase=True)
        self.tokenizer.enable_truncation(max_seq_len)

    def add_word(self, word: str) -> None:
        word = word.lower()
        if word not in self.words:
            self.words[word] = 0
        self.words[word] += 1

    def add_sentence(self, sentence: str) -> None:
        [self.add_word(s) for s in sentence.split()]

    def switch_context(self, new_context: str) -> None:
        if self._context in self._held_conversations and len(
                self._held_conversations[self._context]) > self.conversation_depth:
            self.conversations[self._context].append(
                self._held_conversations[self._context][-self.conversation_depth:])
        self._context = new_context

    def add_conversation(self, conversation: Dict[str, object]) -> None:
        if self._context not in self.conversations:
            self.conversations[self._context] = []
        self.add_line(conversation)
        lc = self._held_conversations[self._context][-1]
        line = lc['line'].split()
        if len(line) > self.longest:
            self.longest = len(line)
        tokenized = self.tokenizer.encode(lc['line'])
        if len(tokenized.ids) > self.longest_tokenized:
            self.longest_tokenized = len(tokenized.ids)

    def add_line(self, conversation: Dict[str, object]) -> bool:
        if self._context not in self._held_conversations or len(
                self._held_conversations[self._context]) == 0:
            self._held_conversations[self._context] = [conversation]
            return True
        hc = self._held_conversations[self._context]  # Held conversation
        lc = hc[-1]  # Last conversation
        same_speaker = len(lc['speaker']) > 0 and lc['speaker'] == conversation[
            'speaker'] and lc['speaker'] not in ['NTP', 'Text']
        continuing_line = (len(lc['speaker']) == 0 or lc['speaker'] in ['NTP', 'Text']) and \
            (len(conversation['speaker']) == 0 or conversation['speaker'] in ['NTP', 'Text']) \
            and len(conversation['line']) > 0 and conversation['line'][0].islower()
        if same_speaker or continuing_line and conversation['when'] - lc[
                'when'] < Vocab.SENTENCE_CUTOFF_DURATION:
            lc['when'] = conversation['when']
            lc['line'] += f" {conversation['line']}"
            return False
        if len(self._held_conversations[self._context]) >= 2:
            self.conversations[self._context].append(
                hc[-min(self.conversation_depth, len(hc)):])
        hc.append(conversation)
        if conversation['when'] - lc['when'] >= Vocab.SENTENCE_CUTOFF_DURATION:
            self._held_conversations[self._context] = [conversation]
        return True

    def get_tokenizer(self) -> BaseTokenizer:
        return self.tokenizer

    def get_conversations(self, in_seq_len: int, out_seq_len: int,
                          add_two_person: bool = True) -> List[Dict[str, List[int]]]:
        conversations = []
        for conversation in self.conversations.values():
            for dialogue in conversation:
                inputs = [
                    self.tokenizer.encode(y['line']) for y in dialogue[:-1]
                ][::-1]
                target = self.tokenizer.encode(dialogue[-1]['line'])
                target.pad(out_seq_len)
                target.truncate(out_seq_len)
                if add_two_person and self.conversation_depth > 2 and len(
                        dialogue) > 2 and len(inputs) > 0:
                    inputs[0].pad(in_seq_len)
                    inputs[0].truncate(in_seq_len)
                    conversations.append({
                        'inputs': inputs[0].ids,
                        'target': target.ids,
                        'mask': inputs[0].attention_mask
                    })
                inputs = Encoding.merge(inputs)
                inputs.pad(in_seq_len)
                inputs.truncate(in_seq_len)
                conversations.append({
                    'inputs': inputs.ids,
                    'target': target.ids,
                    'mask': inputs.attention_mask
                })
        return conversations
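The `Encoding` objects above are padded and truncated individually and combined with `Encoding.merge`; a minimal sketch of those calls, assuming the same local vocab file:

# --- sketch: pad/truncate/merge Encoding objects directly ---
from tokenizers import BertWordPieceTokenizer, Encoding

tok = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt', lowercase=True)
merged = Encoding.merge([tok.encode("hello there"), tok.encode("how are you")])
merged.pad(32)       # right-pads ids and attention_mask up to length 32
merged.truncate(32)  # caps longer encodings; a no-op for this short input
print(len(merged.ids), sum(merged.attention_mask))  # 32, number of real tokens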
def main():
    start_time = time.time()
    args = parse_args()

    make_directories(args.output_dir)

    # Start Tensorboard and log hyperparams.
    tb_writer = SummaryWriter(args.output_dir)
    tb_writer.add_hparams(vars(args), {})

    file_log_handler = logging.FileHandler(os.path.join(args.output_dir, 'log.txt'))
    logger.addHandler(file_log_handler)

    # Get list of text and list of label (integers) from disk.
    train_text, train_label_id_list, eval_text, eval_label_id_list = \
        get_examples_and_labels(args.dataset)

    # Augment training data.
    if (args.augmentation_recipe is not None) and len(args.augmentation_recipe):
        import pandas as pd
        if args.augmentation_recipe == 'textfooler':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv'
        elif args.augmentation_recipe == 'tf-adjusted':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv'
        else:
            raise ValueError(f'Unknown augmentation recipe {args.augmentation_recipe}')
        aug_df = pd.read_csv(aug_csv)
        # filter skipped outputs
        aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']]
        print(f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num} / len {len(aug_df)}')
        original_text = aug_df['original_text']
        perturbed_text = aug_df['perturbed_text']
        # convert `train_text` and `train_label_id_list` to np arrays so things are faster
        train_text = np.array(train_text)
        train_label_id_list = np.array(train_label_id_list)
        x_adv_list = []
        x_adv_id_list = []
        for (x, x_adv) in zip(original_text, perturbed_text):
            x = x.replace('[[', '').replace(']]', '')
            x_adv = x_adv.replace('[[', '').replace(']]', '')
            x_idx = (train_text == x).nonzero()[0][0]
            x_adv_label = train_label_id_list[x_idx]
            x_adv_id_list.append(x_adv_label)
            x_adv_list.append(x_adv)
        # truncate to `args.augmentation_num` examples
        if args.augmentation_num >= 0:
            perm = list(range(len(x_adv_list)))
            random.shuffle(perm)
            perm = perm[:args.augmentation_num]
            x_adv_list = [x_adv_list[i] for i in perm]
            x_adv_id_list = [x_adv_id_list[i] for i in perm]
        train_text = train_text.tolist() + x_adv_list
        train_label_id_list = train_label_id_list.tolist() + x_adv_id_list
        print(f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}')

    label_id_len = len(train_label_id_list)
    num_labels = len(set(train_label_id_list))
    logger.info('num_labels: %s', num_labels)

    train_examples_len = len(train_text)

    if len(train_label_id_list) != train_examples_len:
        raise ValueError(
            f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})')
    if len(eval_label_id_list) != len(eval_text):
        raise ValueError(
            f'Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})')

    print_cuda_memory(args)
    # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s
    # @TODO support other vocabularies, or at least, support case
    tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt', lowercase=True)
    tokenizer.enable_padding(max_length=args.max_seq_len)
    tokenizer.enable_truncation(max_length=args.max_seq_len)

    logger.info(f'Tokenizing training data. (len: {train_examples_len})')
    train_text_ids = [encoding.ids for encoding in tokenizer.encode_batch(train_text)]
    logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})')
    eval_text_ids = [encoding.ids for encoding in tokenizer.encode_batch(eval_text)]
    load_time = time.time()
    logger.info(f'Loaded data and tokenized in {load_time - start_time}s')

    print_cuda_memory(args)

    # Load pre-trained model tokenizer (vocabulary)
    logger.info('Loading model: %s', args.model_dir)
    # Load pre-trained model (weights)
    logger.info('Model class: (vanilla) BertForSequenceClassification.')
    model = BertForSequenceClassification.from_pretrained(args.model_dir, num_labels=num_labels)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    model.to(device)
    # print(model)

    # multi-gpu training
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info(f'Training model across {args.num_gpus} GPUs')

    num_train_optimization_steps = int(
        train_examples_len / args.batch_size / args.grad_accum_steps) * args.num_train_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_proportion,
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Max sequence length = %d", args.max_seq_len)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    wandb.log({'train_examples_len': train_examples_len})

    train_input_ids = torch.tensor(train_text_ids, dtype=torch.long)
    train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long)
    eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_label_ids)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    def get_eval_acc():
        correct = 0
        total = 0
        for input_ids, label_ids in tqdm.tqdm(eval_dataloader, desc="Evaluating accuracy"):
            input_ids = input_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids)[0]
            correct += (logits.argmax(dim=1) == label_ids).sum()
            total += len(label_ids)
        return float(correct) / total

    def save_model():
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        logger.info(f'Best acc found. Saved tokenizer, model config, and model to {args.output_dir}.')

    global_step = 0

    def save_model_checkpoint(checkpoint_name=None):
        # Save model checkpoint
        checkpoint_name = checkpoint_name or 'checkpoint-{}'.format(global_step)
        output_dir = os.path.join(args.output_dir, checkpoint_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Checkpoint saved to %s.', output_dir)

    print_cuda_memory(args)
    model.train()
    best_eval_acc = 0
    steps_since_best_eval_acc = 0

    def loss_backward(loss):
        if args.num_gpus > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(prog_bar):
            print_cuda_memory(args)
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch
            logits = model(input_ids)[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar('loss', loss, global_step)
                # log the learning rate (the original passed the loss here by mistake)
                tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
            loss_backward(loss)
            prog_bar.set_description(f"Loss {loss.item()}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
            # Save model checkpoint to file.
            if global_step % args.checkpoint_steps == 0:
                save_model_checkpoint()
            model.zero_grad()
            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        eval_acc = get_eval_acc()
        tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step)
        wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch})

        if args.checkpoint_every_epoch:
            save_model_checkpoint(f'epoch-{epoch}')

        logger.info(f'Eval acc: {eval_acc * 100}%')
        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            steps_since_best_eval_acc = 0
            save_model()
        else:
            steps_since_best_eval_acc += 1
            if (args.early_stopping_epochs > 0) and (
                    steps_since_best_eval_acc > args.early_stopping_epochs):
                logger.info(
                    f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased")
                break
def main():
    parser = ArgumentParser('GLUE evaluation example')
    parser.add_argument('--glue_dir', type=str, metavar='PATH', required=True,
                        help='Path to directory containing the GLUE tasks data.')
    parser.add_argument('--output_dir', type=str, metavar='PATH', required=True,
                        help='Path to the output directory (for logs, checkpoints, parameters, etc.).')
    parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrite output_dir if it already exists.')
    parser.add_argument('--task_name', type=str, default=None, choices=GLUE_TASKS,
                        help='The specific GLUE task to train and/or evaluate on.')
    parser.add_argument('--do_train', action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval', action='store_true',
                        help='Whether to run eval (on the dev set).')
    parser.add_argument('--config_file', type=str, metavar='PATH', required=True,
                        help='Path to the model configuration.')
    parser.add_argument('--weights_file', type=str, metavar='PATH', required=True,
                        help='Path to the model initialization weights.')
    parser.add_argument('--tokenizer_vocab_file', type=str, metavar='PATH', required=True,
                        help='Path to the tokenizer vocabulary.')
    parser.add_argument('--overwrite_cache', action='store_true',
                        help='Overwrite the cache if it already exists.')
    parser.add_argument('--max_sequence_len', type=int, default=128, metavar='N',
                        help='The maximum length of a sequence.')
    parser.add_argument('--do_lower_case', action='store_true',
                        help='Whether to lowercase the input when tokenizing.')
    parser.add_argument('-n', '--num_epochs', type=int, default=3, metavar='N',
                        help='The number of distillation epochs.')
    parser.add_argument('--per_gpu_train_batch_size', type=int, default=8, metavar='N',
                        help='The batch size per GPU used during training.')
    parser.add_argument('--per_gpu_eval_batch_size', type=int, default=8, metavar='N',
                        help='The batch size per GPU used during evaluation.')
    parser.add_argument('-lr', '--learning_rate', type=float, default=2e-5, metavar='F',
                        help='The initial learning rate.')
    parser.add_argument('--epsilon', type=float, default=1e-8, metavar='F',
                        help="Adam's epsilon.")
    parser.add_argument('--warmup_prop', type=float, default=0.05, metavar='F',
                        help='Linear warmup proportion.')
    parser.add_argument('--num_gradient_accumulation_steps', type=int, default=1, metavar='N',
                        help='The number of gradient accumulation steps (for larger batch sizes).')
    parser.add_argument('--max_gradient_norm', type=float, default=1.0, metavar='F',
                        help='The maximum gradient norm.')
    parser.add_argument('--seed', type=int, default=42, metavar='N',
                        help='Random seed.')
    parser.add_argument('-c', '--use_cuda', action='store_true',
                        help='Whether to use cuda or not.')
    parser.add_argument('-d', '--use_distributed', action='store_true',
                        help='Whether to use distributed training (distillation) or not.')
    parser.add_argument('--local_rank', type=int, default=-1, metavar='N',
                        help='Local process rank.')
    params = parser.parse_args()

    if not params.use_distributed:
        params.local_rank = 0
        params.train_batch_size = params.per_gpu_train_batch_size
        params.eval_batch_size = params.per_gpu_eval_batch_size
    else:
        params.num_gpus = torch.cuda.device_count()
        params.train_batch_size = params.per_gpu_train_batch_size * params.num_gpus
        params.eval_batch_size = params.per_gpu_eval_batch_size * params.num_gpus
    params.is_master = params.local_rank == 0

    if params.use_cuda:
        device = torch.device('cuda', params.local_rank)
    else:
        device = torch.device('cpu')

    # make output_dir
    if Path(params.output_dir).is_dir() and not params.force:
        raise ValueError(
            f'Output directory {params.output_dir} already exists. Use `--force` if you want to overwrite it.')
    if params.is_master:
        Path(params.output_dir).mkdir(parents=True, exist_ok=params.force)

        # dump params
        json.dump(vars(params),
                  open(Path(params.output_dir) / 'params.json', 'w'),
                  indent=4,
                  sort_keys=True)
    params.glue_dir = Path(params.glue_dir)
    params.output_dir = Path(params.output_dir)
    params.device = device

    # initialize multi-GPU
    if params.use_distributed:
        if params.is_master:
            logger.info('Initializing PyTorch distributed')
        torch.cuda.set_device(params.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')

    # set seed(s)
    if params.is_master:
        logger.info('Setting random seed(s)')
    random.seed(params.seed)
    np.random.seed(params.seed)
    torch.manual_seed(params.seed)
    if params.use_distributed:
        torch.cuda.manual_seed_all(params.seed)

    # initialize the tokenizer
    if params.is_master:
        logger.info('Initializing the tokenizer')
    tokenizer = BertWordPieceTokenizer(params.tokenizer_vocab_file,
                                       lowercase=params.do_lower_case)

    # enable truncation and padding
    tokenizer.enable_truncation(params.max_sequence_len)
    tokenizer.enable_padding(length=params.max_sequence_len)

    # go over each task
    if params.task_name is not None:
        tasks = [params.task_name]
        output_dirs = [params.output_dir]
    else:
        tasks = GLUE_TASKS
        output_dirs = [params.output_dir / task / str(params.seed) for task in tasks]

    for task, task_output_dir in zip(tasks, output_dirs):
        # prepare the GLUE task
        if params.is_master:
            logger.info(f'Preparing the {task} GLUE task')

        # make task_output_dir
        if task_output_dir.is_dir() and not params.force:
            raise ValueError(
                f'Task output directory {task_output_dir} already exists. Use `--force` if you want to overwrite it.')
        if params.is_master:
            task_output_dir.mkdir(parents=True, exist_ok=params.force)

        # initialize the model
        if params.is_master:
            logger.info(f'{task} - Initializing the model')
        config = DistilBertConfig.from_pretrained(
            params.config_file,
            num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
            finetuning_task=task)
        model = DistilBertForSequenceClassification.from_pretrained(
            params.weights_file, config=config)

        # send model to device
        model = model.to(params.device)

        # perform the training
        if params.do_train:
            # initialize the training dataset
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataset')
            train_dataset = GLUETaskDataset(task=task,
                                            glue_dir=params.glue_dir,
                                            split='train',
                                            tokenizer=tokenizer,
                                            overwrite_cache=params.overwrite_cache)

            # initialize the sampler
            if params.is_master:
                logger.info(f'{task} - Initializing the training sampler')
            train_sampler = DistributedSampler(train_dataset) \
                if params.use_distributed else RandomSampler(train_dataset)

            # initialize the dataloader
            if params.is_master:
                logger.info(f'{task} - Initializing the training dataloader')
            train_dataloader = DataLoader(dataset=train_dataset,
                                          sampler=train_sampler,
                                          batch_size=params.train_batch_size)

            # initialize the optimizer
            if params.is_master:
                logger.info(f'{task} - Initializing the optimizer')
            optimizer = optim.Adam(model.parameters(),
                                   lr=params.learning_rate,
                                   eps=params.epsilon,
                                   betas=(0.9, 0.98))

            # initialize the learning rate scheduler
            if params.is_master:
                logger.info(f'{task} - Initializing the learning rate scheduler')
            num_steps_epoch = len(train_dataloader)
            num_train_steps = math.ceil(
                num_steps_epoch / params.num_gradient_accumulation_steps * params.num_epochs)
            num_warmup_steps = math.ceil(num_train_steps * params.warmup_prop)

            def lr_lambda(current_step):
                if current_step < num_warmup_steps:
                    return float(current_step) / float(max(1, num_warmup_steps))
                return max(
                    0.0,
                    float(num_train_steps - current_step) /
                    float(max(1, num_train_steps - num_warmup_steps)))

            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                       lr_lambda=lr_lambda,
                                                       last_epoch=-1)

            # initialize distributed data parallel (DDP)
            if params.use_distributed:
                if params.is_master:
                    logger.info('Initializing DDP')
                model = DDP(model,
                            device_ids=[params.local_rank],
                            output_device=params.local_rank)

            # start training
            if params.is_master:
                logger.info(f'{task} - Starting the training')
            train(task=task,
                  model=model,
                  dataloader=train_dataloader,
                  optimizer=optimizer,
                  num_epochs=params.num_epochs,
                  lr_scheduler=lr_scheduler,
                  num_gradient_accumulation_steps=params.num_gradient_accumulation_steps,
                  max_gradient_norm=params.max_gradient_norm,
                  device=params.device,
                  use_distributed=params.use_distributed,
                  is_master=params.is_master,
                  use_tqdm=True,
                  logger=logger)

            # save the finetuned model
            if params.is_master:
                # take care of distributed training
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.config.architectures = [model_to_save.__class__.__name__]

                logger.info(f'{task} - Saving the finetuned model config')
                json.dump(vars(model_to_save.config),
                          open(task_output_dir / TRAINED_CONFIG_FILE_TEMPLATE.format(
                              model_name=model_to_save.__class__.__name__, task=task),
                              mode='w'),
                          indent=4,
                          sort_keys=True)

                logger.info(f'{task} - Saving the finetuned model weights')
                torch.save(
                    model_to_save.state_dict(),
                    task_output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                        model_name=model_to_save.__class__.__name__, task=task))

                # reload the model
                if params.do_eval:
                    if params.is_master:
                        logger.info(f'{task} - Reloading the model')
                    config = DistilBertConfig.from_pretrained(
                        str(task_output_dir / TRAINED_CONFIG_FILE_TEMPLATE.format(
                            model_name=model_to_save.__class__.__name__, task=task)),
                        num_labels=len(GLUE_TASKS_MAPPING[task]['labels']),
                        finetuning_task=task)
                    model = DistilBertForSequenceClassification.from_pretrained(
                        str(task_output_dir / TRAINED_WEIGHTS_FILE_TEMPLATE.format(
                            model_name=model_to_save.__class__.__name__, task=task)),
                        config=config)
                    model = model.to(params.device)

        # perform the evaluation
        if params.do_eval and params.is_master:
            # initialize the evaluation dataset
            logger.info(f'{task} - Initializing the evaluation dataset')
            eval_datasets = [
                GLUETaskDataset(task=task,
                                glue_dir=params.glue_dir,
                                split='dev',
                                tokenizer=tokenizer,
                                overwrite_cache=params.overwrite_cache)
            ]

            # hot fix for MNLI-MM
            if task == 'MNLI':
                eval_datasets.append(
                    GLUETaskDataset(task='MNLI-MM',
                                    glue_dir=params.glue_dir,
                                    split='dev',
                                    tokenizer=tokenizer))

            for eval_dataset in eval_datasets:
                # initialize the sampler
                logger.info(f'{eval_dataset.task} - Initializing the evaluation sampler')
                eval_sampler = SequentialSampler(eval_dataset)

                # initialize the dataloader
                logger.info(f'{eval_dataset.task} - Initializing the evaluation dataloader')
                eval_dataloader = DataLoader(dataset=eval_dataset,
                                             sampler=eval_sampler,
                                             batch_size=params.eval_batch_size)

                # start evaluating
                logger.info(f'{eval_dataset.task} - Starting the evaluation')
                results = evaluate(task=task,
                                   model=model,
                                   dataloader=eval_dataloader,
                                   device=params.device,
                                   use_tqdm=True)

                # log results
                logger.info(f'{eval_dataset.task} - Evaluation results:')
                for key, result in results.items():
                    logger.info(f'{eval_dataset.task} - {key}: {result}')

                # dump results
                json.dump(results,
                          open(task_output_dir / RESULTS_FILE_TEMPLATE.format(
                              model_name=model.__class__.__name__,
                              task=eval_dataset.task), 'w'),
                          indent=4)

        if params.is_master:
            logger.info(f'Done with the {task} GLUE task')
print(movie_reviews.columns.values)
print(movie_reviews.sentiment.unique())

y = movie_reviews["sentiment"]
y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocab file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
tokenizer.enable_truncation(MAX_SEQ_LEN - 2)

train_count = 40000
test_count = 2000

# X_train = convert_sentences_to_features(reviews[:40000], tokenizer)
# X_test = convert_sentences_to_features(reviews[40000:], tokenizer)
X_train = convert_sentences_to_features(reviews[:train_count], tokenizer)
X_test = convert_sentences_to_features(reviews[train_count:train_count + test_count], tokenizer)

one_hot_encoded = to_categorical(y)
# one_hot_encoded = tf.one_hot(y, 1)
# y_train = one_hot_encoded[:40000]
# y_test = one_hot_encoded[40000:]
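Truncating to MAX_SEQ_LEN - 2 presumably leaves headroom for two extra tokens added downstream (e.g., by convert_sentences_to_features); that reading is an assumption, but the cap itself is easy to check:

# --- sketch: confirm the truncation cap ---
enc = tokenizer.encode("a very long review " * 200)
assert len(enc.ids) <= MAX_SEQ_LEN - 2  # headroom for tokens added later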