def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab_file is not None:
        tokenizer = Tokenizer(
            WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
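# Hedged usage sketch (not part of the original file): assuming this __init__
# belongs to a BertWordPieceTokenizer implementation from an early `tokenizers`
# release exported at the package top level, and assuming a local
# "bert-base-uncased-vocab.txt" (hypothetical path), instantiating it and
# encoding a sentence pair could look like this:
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
encoding = tokenizer.encode("Hello, world!", "How are you?")
print(encoding.tokens)  # e.g. ['[CLS]', 'hello', ',', 'world', '!', '[SEP]', ...]
print(encoding.ids)     # the matching vocabulary ids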
print("Running GPT-2 tokenizer") tok_p = GPT2Tokenizer.from_pretrained('gpt2') # Create a Tokenizer using BPE tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges)) # Use ByteLevel PreTokenizer tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) # Use ByteLevel Decoder tok_r.decoder = decoders.ByteLevel() elif args.type == "bert": print("Running Bert tokenizer") tok_p = BertTokenizer.from_pretrained(args.vocab) tok_r = Tokenizer( WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)) tok_r.normalizer = BertNormalizer( clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, ) # tok_r.pre_tokenizer = pre_tokenizers.Whitespace() tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer() tok_r.decoder = decoders.WordPiece() tok_r.post_processor = BertProcessing( ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")), ) else:
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:

                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch,
                                                       labels_batch):
                                # Convert sparse arrays to python lists
                                # for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists
                        # for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(
                            json.dumps([example.ids, labels]) + '\n')
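# Hedged sketch of reading one of the output files back (not part of the
# original code): the first row holds the (n_examples, n_labels) shape, and
# each following row holds a [token_ids, label_indices] pair. "out.jsonl.gz"
# is a hypothetical name standing in for what generate_out_filename produces.
with xopen("out.jsonl.gz", "rt") as f:
    n_examples, n_labels = json.loads(f.readline())
    for line in f:
        token_ids, label_indices = json.loads(line)
        # token_ids: ids padded/truncated to args.seq_len
        # label_indices: positions of the positive labels in mlb.classes_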
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab_file is not None:
        tokenizer = Tokenizer(
            WordPiece.from_files(vocab_file, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def test_instantiate(self, bert_files):
    assert isinstance(WordPiece.empty(), Model)
    assert isinstance(WordPiece.from_files(bert_files["vocab"]), Model)
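# Hedged companion check (not in the original test file): a loaded WordPiece
# model can also be wrapped in a Tokenizer, as the scripts above do. This
# assumes the vocab fixture contains the standard "[UNK]" entry and that
# Tokenizer is imported from the `tokenizers` package.
def test_token_to_id(self, bert_files):
    tokenizer = Tokenizer(
        WordPiece.from_files(bert_files["vocab"], unk_token="[UNK]"))
    assert tokenizer.token_to_id("[UNK]") is not None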
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:

                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        # example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists
                                # for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:", len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps(
                                        [example.overflowing[mid].ids, labels,
                                         len(example.overflowing)+1]) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps(
                                        [example.ids, labels,
                                         len(example.overflowing)+1]) + '\n')"""
                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    # print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)),
                                    #     sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []
                            doc_idx_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(example_batch,
                                                        labels_batch,
                                                        doc_idx_batch):
                        # Convert sparse arrays to python lists
                        # for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:", len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps(
                                [example.overflowing[mid].ids, labels,
                                 len(example.overflowing)+1]) + '\n')
                        except IndexError:
                            out_f.write(json.dumps(
                                [example.ids, labels,
                                 len(example.overflowing)+1]) + '\n')"""
                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            # print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)),
                            #     sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
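# `get_window` is referenced above but not defined in this file. Below is a
# hedged sketch of one possible implementation, consistent only with how it is
# called (START_POS is a fraction in [0, 1]; the result must be exactly 512
# integer ids). It is not the original function, and the pad id 0 is an
# assumption.
def get_window(example, start_pos, window_len=512):
    # Flatten the first block and any overflowing blocks into one id sequence.
    ids = list(example.ids)
    for blk in example.overflowing:
        ids.extend(blk.ids)
    # Start the window at the requested relative position, but never so far in
    # that fewer than window_len ids would remain.
    start = min(int(len(ids) * start_pos), max(len(ids) - window_len, 0))
    window = ids[start:start + window_len]
    # Pad with the assumed pad id if the document is shorter than the window.
    window += [0] * (window_len - len(window))
    return window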