import os
import sys

import tokenizers

sys.path.append("/usr/src/app/kaggle/tweet-sentiment-extraction")

MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 50
BERT_PATH = "inputs/bert-large-uncased-wwm-finetuned-squad"
TOKENIZER = tokenizers.BertWordPieceTokenizer(
    os.path.join(BERT_PATH, "vocab.txt"), lowercase=True
)
INPUT_DIR = 'inputs'
OUT_DIR = 'models'
TRAIN_PATH = os.path.join(INPUT_DIR, "train.csv")
TEST_PATH = os.path.join(INPUT_DIR, "test.csv")
SAMPLE_PATH = os.path.join(INPUT_DIR, "sample_submission.csv")
FOLD0_ONLY = False
DEBUG = False
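# A minimal usage sketch (not part of the original config; the sample tweet is made up):
# the fast WordPiece tokenizer returns both token ids and character offsets, which a
# span-extraction pipeline can map predictions back onto.
sample_tweet = "this day is just perfect"
enc = TOKENIZER.encode(sample_tweet)
print(enc.ids[:MAX_LEN])      # WordPiece ids, including [CLS]/[SEP]
print(enc.offsets[:MAX_LEN])  # (start, end) character offsets per token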
# reference: https://huggingface.co/clagator/biobert_squad2_cased
import os
import time

import tokenizers
import torch
from transformers import BertForQuestionAnswering

BERT_PATH = "model"
# print(f"Bert path: {BERT_PATH}")
model = BertForQuestionAnswering.from_pretrained(f'{BERT_PATH}/')
tokenizer = tokenizers.BertWordPieceTokenizer(f"{BERT_PATH}/vocab.txt", lowercase=True)


def get_answer(question, context):
    # Encode question and context together as a sequence pair.
    tok_text = tokenizer.encode(question, context)
    tokens = tok_text.tokens
    start_positions = torch.tensor([0])  # unused in this inference path
    end_positions = torch.tensor([1])    # unused in this inference path
    input_ids = tok_text.ids
    token_type_ids = tok_text.type_ids
    mask = [1] * len(token_type_ids)
    # Pad ids, attention mask, and segment ids out to a fixed length of 50.
    max_len = 50
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
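    # A hedged sketch of how the inference might continue from here (not part of the
    # original snippet; the transformers >= 4.x output attributes and the span-joining
    # strategy are assumptions).
    with torch.no_grad():
        outputs = model(
            input_ids=torch.tensor([input_ids]),
            attention_mask=torch.tensor([mask]),
            token_type_ids=torch.tensor([token_type_ids]),
        )
    start_idx = int(torch.argmax(outputs.start_logits))
    end_idx = int(torch.argmax(outputs.end_logits))
    # Join the WordPiece tokens of the predicted span and strip subword markers.
    answer = " ".join(tokens[start_idx:end_idx + 1]).replace(" ##", "")
    return answer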
def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    spe_bos: bool,
    spe_eos: bool,
    spe_pad: bool,
    lower_case: bool,
):
    """
    Trains a tokenizer (SentencePiece or WordPiece) on a text corpus and returns the directory it was saved to.

    Args:
        text_path: source file with text lines
        dst_folder: where the tokenizer files will be stored
        vocab_size: vocabulary size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe.
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            this can be < 1.0; for all other languages it should be set to 1.0.
        spe_sample_size: int, default of -1. If a positive integer is used, samples the dataset by the given sample size.
        spe_train_extremely_large_corpus: bool. If the dataset is too large and the user has sufficient RAM, this flag
            can be set to try to train the tokenizer. Will silently fail if it runs out of RAM.
        spe_max_sentencepiece_length: limits the maximum length of the SentencePiece subword that can be constructed.
            By default, no limit is placed.
        spe_bos: bool flag, whether to add <s> to the SentencePiece tokenizer vocabulary.
        spe_eos: bool flag, whether to add </s> to the SentencePiece tokenizer vocabulary.
        spe_pad: bool flag, whether to add <pad> to the SentencePiece tokenizer vocabulary.
        lower_case: whether to tokenize with the lower case character set only (for English)

    Returns:
        Path to the directory containing the trained tokenizer.
    """
    if tokenizer_type == 'spe':
        if spe_max_sentencepiece_length > 0:
            tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}_max{}').format(
                tokenizer_type, spe_type, vocab_size, spe_max_sentencepiece_length)
        else:
            tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(
                tokenizer_type, spe_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file!")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
            bos=spe_bos,
            eos=spe_eos,
            pad=spe_pad,
        )
    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(
            tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)
        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir
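# A hedged usage sketch (paths, vocab size, and flag values are made up): training a
# 1024-token WordPiece tokenizer with the helper above. The spe_* arguments are ignored
# on the 'wpe' path but are still required by the signature; the 'spe' path additionally
# needs create_spt_model in scope.
wpe_dir = __process_data(
    text_path="corpus.txt",
    dst_folder="tokenizers",
    vocab_size=1024,
    tokenizer_type="wpe",
    spe_type="bpe",
    spe_character_coverage=1.0,
    spe_train_extremely_large_corpus=False,
    spe_sample_size=-1,
    spe_max_sentencepiece_length=-1,
    spe_bos=False,
    spe_eos=False,
    spe_pad=False,
    lower_case=True,
)
print(wpe_dir)  # e.g. tokenizers/tokenizer_wpe_v1024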
def __init__(self, df):
    self.df = df
    self.tokenizer = tokenizers.BertWordPieceTokenizer(
        "/mnt/bert-base-uncased/vocab.txt", lowercase=True
    )
    self.max_len = 128
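# A hedged sketch (not part of the original class): assuming this __init__ belongs to a
# torch-style Dataset over a dataframe with a "text" column (the column name and the
# pad id 0 are assumptions), companion __len__/__getitem__ methods might pad each
# encoding out to self.max_len like this.
def __len__(self):
    return len(self.df)

def __getitem__(self, idx):
    enc = self.tokenizer.encode(self.df.iloc[idx]["text"])
    ids = enc.ids[: self.max_len]
    mask = [1] * len(ids)
    pad = self.max_len - len(ids)
    return {
        "ids": ids + [0] * pad,    # [PAD] has id 0 in the bert-base-uncased vocab
        "mask": mask + [0] * pad,  # attention mask: 1 for real tokens, 0 for padding
    }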
print('[encoder] Found {} input files'.format(len(input_files)))

output_prefix = 'sequences_'
output_prefix += 'uppercase' if args.uppercase else 'lowercase'
output_prefix += '_max_seq_len_' + str(args.max_seq_len)
output_prefix += '_next_seq_task_' + str(args.next_seq_prob > 0).lower()

args.output_dir = os.path.join(args.output_dir, output_prefix)
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)

if args.tokenizer == 'wordpiece':
    tokenizer = tokenizers.BertWordPieceTokenizer(
        vocab=args.vocab_file,
        clean_text=True,
        handle_chinese_chars=True,
        lowercase=not args.uppercase,
    )
elif args.tokenizer == 'bpe':
    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab=args.vocab_file,
        add_prefix_space=True,
        lowercase=not args.uppercase,
        trim_offsets=True,
    )
# tokenizer.enable_padding(length=args.max_seq_len)
# tokenizer.enable_truncation(max_length=args.max_seq_len)

params = []
for i, ifile in enumerate(input_files):
    ofile = os.path.join(args.output_dir, 'train_{}.hdf5'.format(i))
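    # A hedged sketch of what might follow inside this loop (not from the original script;
    # reading the shard line by line and using encode_batch are assumptions). With the
    # commented-out padding/truncation calls above enabled, every line becomes a
    # fixed-length row of token ids ready to be written to the train_{i}.hdf5 shard.
    with open(ifile, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
    encodings = tokenizer.encode_batch(lines)
    ids = [e.ids for e in encodings]               # token id rows
    masks = [e.attention_mask for e in encodings]  # 1 = real token, 0 = padding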
    output_string = text
    # output_string = post_process(output_string)
    output_string = output_string.strip()
    return output_string


def filt_thresh(idx, thresh=0):
    idx = idx[idx > thresh]
    return np.nonzero(idx)[0] if len(np.nonzero(idx)[0]) > 0 else np.array([0])


BERT_TOKENIZER = tokenizers.BertWordPieceTokenizer(
    "../input/vocab.txt", lowercase=True, add_special_tokens=False)
ROBERT_TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="../input/roberta-base/vocab.json",
    merges_file="../input/roberta-base/merges.txt",
    lowercase=True,
    add_prefix_space=True)

CV_FILE = '../model/exper/cv.csv'
MAX_LEN = 100

df = pd.read_csv("../input/train.csv")
print(df.keys())
pred_df = pd.read_csv(CV_FILE).dropna()
print(pred_df.keys())
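# A brief usage note for filt_thresh above (the example values are made up): the returned
# indices are positions within the *filtered* array, not the original one, and an
# all-below-threshold input falls back to np.array([0]).
print(filt_thresh(np.array([0.0, 0.6, 0.0, 0.9]), thresh=0.5))  # -> [0 1]
print(filt_thresh(np.array([0.1, 0.2]), thresh=0.5))            # -> [0] (fallback)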
import tokenizers

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 4
VALID_BATCH_SIZE = 8
EPOCHS = 10
BASE_MODEL_PATH = 'bert-base-uncased'
VOCAB_PATH = '/content/vocab.txt'
MODEL_PATH = 'model.bin'
TOKENIZER = tokenizers.BertWordPieceTokenizer(VOCAB_PATH, lowercase=True)
import transformers
import tokenizers

tok = tokenizers.BertWordPieceTokenizer(
    '/content/drive/MyDrive/notebooks/bertNER1/vacob/bert-base-uncased.txt',
    lowercase=True)

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
EARLYSTOPPING = 6
METADATA_PATH = "D:/bertNER1/model/meta.bin"
MODEL_PATH = "/content/drive/MyDrive/notebooks/bertNER1/model/model.bin"
TRAINING_FILE = '/content/drive/MyDrive/notebooks/bertNER1/sample.txt'
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)
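# A minimal sketch (not part of the original config; the sample sentence is made up):
# the fast tokenizer `tok` and the transformers `TOKENIZER` above should yield the same
# WordPiece ids, assuming the vocab file on the 'vacob' path matches bert-base-uncased.
# The fast tokenizer also exposes per-token character offsets, useful for NER alignment.
sample = "John lives in Berlin"
fast_enc = tok.encode(sample)        # Encoding with .ids / .tokens / .offsets
slow_ids = TOKENIZER.encode(sample)  # plain list of ids, with [CLS]/[SEP] added
print(fast_enc.ids == slow_ids)      # expected True if the vocabs match
print(fast_enc.offsets)              # (start, end) character spans per token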