import torch
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          BertTokenizerFast, RobertaTokenizer)


def init_tgt(params):
    """ Initialize the parameters of the target model """
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    config = AutoConfig.from_pretrained(params.src_model, cache_dir=params.cache_dir)
    model = AutoModelForMaskedLM.from_pretrained(
        params.src_model,
        from_tf=bool(".ckpt" in params.src_model),
        config=config,
        cache_dir=params.cache_dir,
    )

    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = AutoTokenizer.from_pretrained(
            params.src_model, cache_dir=params.cache_dir, use_fast=True)

    # get English word-embeddings and bias
    # NOTE: assumes a BERT-style MLM head (model.cls.predictions)
    src_embs = model.base_model.embeddings.word_embeddings.weight.detach().clone()
    src_bias = model.cls.predictions.bias.detach().clone()

    # initialize target tokenizer; we always use a BERT WordPiece tokenizer
    # for the target language
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=True,
                                      strip_accents=False)
    tgt_embs, tgt_bias = guess(src_embs, src_bias, tgt_tokenizer, src_tokenizer,
                               prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(), src_bias.norm().item()))

    model.base_model.embeddings.word_embeddings.weight.data = tgt_embs
    model.cls.predictions.bias.data = tgt_bias
    model.tie_weights()
    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model.base_model.embeddings.word_embeddings.weight.norm().item(),
        model.cls.predictions.bias.norm().item()))

    # save the model
    model_to_save = (model.module if hasattr(model, "module") else model
                     )  # Take care of distributed/parallel training
    model_to_save.save_pretrained(params.tgt_model)
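The `guess` helper called above is not shown in this snippet. The sketch below illustrates one plausible vocabulary-transfer strategy: copy the vector of any target token that also exists in the source vocabulary, and fall back to the mean source embedding otherwise. The name `guess_sketch` and the strategy itself are assumptions, and the translation-probability path (`prob`) is omitted.

import torch

def guess_sketch(src_embs, src_bias, tgt_tokenizer, src_tokenizer, prob=None):
    # Hypothetical re-implementation: shared tokens keep their source vector,
    # all other target tokens start from the mean source embedding/bias.
    src_vocab = src_tokenizer.get_vocab()
    tgt_vocab = tgt_tokenizer.get_vocab()
    mean_emb = src_embs.mean(dim=0)
    mean_bias = src_bias.mean()
    tgt_embs = mean_emb.repeat(len(tgt_vocab), 1)
    tgt_bias = mean_bias.repeat(len(tgt_vocab))
    for token, tgt_id in tgt_vocab.items():
        src_id = src_vocab.get(token)
        if src_id is not None:
            tgt_embs[tgt_id] = src_embs[src_id]
            tgt_bias[tgt_id] = src_bias[src_id]
    return tgt_embs, tgt_bias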
def __init__(self, model_dir):
    # Load the fine-tuned classifier and its tokenizer files from a local
    # checkpoint directory.
    self.model = RobertaForSequenceClassification.from_pretrained(
        model_dir, output_attentions=True, output_hidden_states=True)
    self.tokenizer = RobertaTokenizer(
        os.path.join(model_dir, "vocab.json"),
        merges_file=os.path.join(model_dir, "merges.txt"),
    )
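A hedged usage sketch for the wrapper above, assuming a recent transformers version with dict-style model outputs; the class name `ClassifierWrapper`, the checkpoint directory, and the example sentence are illustrative, not from the source.

# Hypothetical usage of the wrapper above (names are illustrative).
wrapper = ClassifierWrapper("checkpoints/roberta-classifier")
inputs = wrapper.tokenizer("the movie was great", return_tensors="pt")
outputs = wrapper.model(**inputs)
predicted_class = outputs.logits.argmax(dim=-1).item()
attentions = outputs.attentions   # available because output_attentions=True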
import gzip
import os
from abc import ABC, abstractmethod
from itertools import islice

from torch.utils.data import IterableDataset
from transformers import RobertaTokenizer


class BaseDataset(ABC, IterableDataset, metaclass=DatasetRegistry):
    task = None
    tokenizer = RobertaTokenizer(vocab_file='vocabs/roberta-vocab.json',
                                 merges_file='vocabs/roberta-merges.txt')
    pad_token_id = tokenizer.convert_tokens_to_ids(['<pad>'])[0]
    mask_token_id = tokenizer.convert_tokens_to_ids(['<mask>'])[0]
    gap_token_id = tokenizer.convert_tokens_to_ids(['<gap>'])[0]

    def __init__(self, data_file, data_size, local_rank, world_size=None):
        if not os.path.isfile(data_file):
            raise FileNotFoundError(f'{data_file} does not exist or is a directory.')
        self.data_file = data_file
        self.size = data_size
        if local_rank == -1:
            self.start = 0
            self.step = 1
        else:
            # each worker reads every world_size-th line, starting at its rank
            self.start = local_rank
            self.step = world_size

    def __len__(self):
        return self.size

    def __iter__(self):
        file_iter = gzip.open(self.data_file, 'rt')
        islice_iter = islice(file_iter, self.start, None, self.step)
        processed_iter = map(self.process_line, islice_iter)
        return processed_iter

    @staticmethod
    def tokenize_first_segment(segment):
        return ['<s>'] + segment.split() + ['</s>']

    @staticmethod
    def tokenize_second_segment(segment):
        return ['</s>'] + segment.split() + ['</s>']

    @staticmethod
    def pad_2d(array_2d, pad_value=0):
        row_lengths = [len(row) for row in array_2d]
        max_len = max(row_lengths)
        for i in range(len(array_2d)):
            array_2d[i] += [pad_value for _ in range(max_len - row_lengths[i])]
        return array_2d

    @abstractmethod
    def process_line(self, line):
        pass

    @classmethod
    @abstractmethod
    def collate_fn(cls, batch):
        pass
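A minimal sketch of a concrete subclass, assuming a gzipped file with one tab-separated sentence pair per line; the class name, task string, file layout, and batch size are illustrative, not taken from the source.

import torch
from torch.utils.data import DataLoader


class PairDataset(BaseDataset):
    # Hypothetical subclass: each input line is "first_segment<TAB>second_segment".
    task = 'pair'

    def process_line(self, line):
        first, second = line.rstrip('\n').split('\t')
        tokens = self.tokenize_first_segment(first) + self.tokenize_second_segment(second)
        return self.tokenizer.convert_tokens_to_ids(tokens)

    @classmethod
    def collate_fn(cls, batch):
        # pad variable-length id lists to a rectangular tensor
        return torch.tensor(cls.pad_2d(batch, pad_value=cls.pad_token_id))


loader = DataLoader(PairDataset('pairs.tsv.gz', data_size=1000, local_rank=-1),
                    batch_size=16, collate_fn=PairDataset.collate_fn)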
def __init__(self):
    super(ReviewModel, self).__init__()
    tokenizer = RobertaTokenizer(
        vocab_file=Constants.VOCAB_FILE,
        merges_file=Constants.MERGES_FILE,
        add_prefix_space=True,
    )
    config = RobertaConfig(output_hidden_states=True)
    # randomly initialized backbone; embeddings are resized to match the tokenizer
    self.backbone = RobertaModel(config)
    self.backbone.resize_token_embeddings(len(tokenizer))
    self.fc = nn.Linear(in_features=config.hidden_size, out_features=1, bias=True)
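The class above only shows the constructor. Below is a hedged sketch of a matching forward pass; mean pooling over the last hidden state is an assumption, not necessarily the author's choice.

def forward(self, input_ids, attention_mask=None):
    # Sketch only: pool the last hidden state and project to a single score.
    outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
    hidden = outputs[0]              # (batch, seq_len, hidden_size)
    pooled = hidden.mean(dim=1)      # simple mean pooling (assumed)
    return self.fc(pooled)           # (batch, 1)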
import torch
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, RobertaTokenizer


def init_tgt(params):
    """ Initialize the parameters of the target model """
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    model = torch.load(params.src_model)

    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = BertTokenizer(params.src_vocab, do_lower_case=False)

    # get English word-embeddings and bias
    src_embs = model[MAP['word_embeddings']]
    src_bias = model[MAP['output_bias']]

    # initialize target tokenizer; we always use BertWordPieceTokenizer
    # for the target language
    tgt_tokenizer = BertWordPieceTokenizer(params.tgt_vocab,
                                           unk_token=UNK_TOKEN,
                                           sep_token=SEP_TOKEN,
                                           cls_token=CLS_TOKEN,
                                           pad_token=PAD_TOKEN,
                                           mask_token=MASK_TOKEN,
                                           lowercase=False,
                                           strip_accents=False)
    tgt_embs, tgt_bias = guess(src_embs, src_bias, tgt_tokenizer, src_tokenizer,
                               prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(), src_bias.norm().item()))

    model[MAP['word_embeddings']] = tgt_embs
    model[MAP['output_bias']] = tgt_bias
    # tie the output projection to the (new) input embeddings
    model[MAP['output_weight']] = model[MAP['word_embeddings']]
    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model[MAP['word_embeddings']].norm().item(),
        model[MAP['output_bias']].norm().item()))

    # save the model
    torch.save(model, params.tgt_model)
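Here `MAP`, `guess`, and the special-token constants are defined elsewhere. For a checkpoint saved as a Hugging Face BertForMaskedLM state dict, `MAP` would plausibly look like the following; the exact key names are an assumption about the checkpoint layout, not taken from the source.

# Plausible MAP for a BertForMaskedLM state dict (assumed; not shown in the source).
MAP = {
    'word_embeddings': 'bert.embeddings.word_embeddings.weight',
    'output_bias': 'cls.predictions.bias',
    'output_weight': 'cls.predictions.decoder.weight',
}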
def _roberta(self, text, unit="text"):
    """ Tokenize text with RoBERTa's byte-level BPE tokenizer. """
    if self.bpe_tokenizer is None:
        # lazily build the tokenizer from the configured vocab/merges files
        vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
        merges_path = self.data_handler.read(self.config["merges_path"], return_path=True)
        del self.config["vocab_path"]
        del self.config["merges_path"]

        self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path, **self.config)

    return self.bpe_tokenizer._tokenize(text)
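For reference, the private `_tokenize` call above behaves like the standalone example below; the vocab/merges paths are placeholders for RoBERTa's byte-level BPE files, and the printed output is indicative only.

from transformers import RobertaTokenizer

# Standalone illustration of the _tokenize call used above; paths are placeholders.
bpe = RobertaTokenizer("vocabs/roberta-vocab.json", "vocabs/roberta-merges.txt")
print(bpe._tokenize("Hello world"))   # byte-level BPE pieces, e.g. ['Hello', 'Ġworld']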
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, RobertaTokenizer


def make_xnli_data(params):
    """
    read text file and tensorize
    the input text file has the tab-separated format "premise<TAB>hypothesis<TAB>label"
    """
    if params.merge:
        tokenizer = RobertaTokenizer(params.vocab, params.merge)
    else:
        tokenizer = BertTokenizer(params.vocab, do_lower_case=False)

    xs, ys = [], []
    labels = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
    pad_index = tokenizer.pad_token_id

    with open(params.input, 'r') as f:
        next(f)  # skip header line
        for i, line in enumerate(f):
            cols = line.rstrip().split('\t')
            if len(cols) != 3:
                print(f"potential error at line {i}")
            enc1 = tokenizer.encode(cols[0])
            enc2 = tokenizer.encode(cols[1])
            xs.append(enc1 + enc2[1:-1])  # [CLS] p1 ... [SEP] h1 ... hn
            ys.append(labels[cols[2]])

    # unknown-word statistics (computed before padding so pad tokens are not counted)
    xs = [torch.LongTensor(s) for s in xs]
    unk_index = tokenizer.unk_token_id
    n_unks = sum([(s == unk_index).sum().item() for s in xs])
    n_toks = sum([len(s) for s in xs])
    p_unks = n_unks * 100. / n_toks
    print(f"{n_toks} tokens - {p_unks:.2f}% unknown words")

    # convert data to a padded tensor
    xs = pad_sequence(xs, batch_first=True, padding_value=pad_index)
    ys = torch.LongTensor(ys)

    data = {'xs': xs, 'ys': ys, 'pad_index': pad_index}
    torch.save(data, params.output)
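make_xnli_data expects a params namespace with input, output, vocab, and merge fields; a sketch of a matching argument parser is below (flag names and defaults are assumptions).

import argparse

# Hypothetical argument parser matching the fields used by make_xnli_data.
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True,
                    help='TSV file: premise<TAB>hypothesis<TAB>label, with a header row.')
parser.add_argument('--output', required=True, help='Path for the tensorized torch.save output.')
parser.add_argument('--vocab', required=True, help='Tokenizer vocabulary file.')
parser.add_argument('--merge', default=None,
                    help='BPE merges file; if given, a RobertaTokenizer is used instead of BERT.')
params = parser.parse_args()
make_xnli_data(params)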
                    help='Number of processes to use for data preprocessing.')
parser.add_argument('--data_dir', type=str, default='wikipedia/extracted',
                    help='Directory with data to preprocess.')
parser.add_argument('--save_dir', type=str, default='GT/text',
                    help='Directory for saving preprocessed data.')
parser.add_argument('--seed', type=int, default=111, help='Random seed.')
args = parser.parse_args()

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
tokenizer = RobertaTokenizer(vocab_file='../vocabs/roberta-vocab.json',
                             merges_file='../vocabs/roberta-merges.txt',
                             additional_special_tokens=['<gap>'])

GAP_TOKEN = '<gap>'
UNK_TOKEN = '<unk>'
MAX_PAIR_LENGTH = 508
LOWER = False

# Sentences that are too long will be split on these tokens
split_tokens_1 = {
    '.', '?', '!', ',', ':', ';', 'that', 'which', 'who', 'whom', 'whose',
    'when', 'where', 'of', 'for', 'from', 'was', 'is', 'are', 'were', 'and',
    'or', 'but', 'if', 'whether', 'while', 'because', 'though', 'as', 'to'
}
split_tokens_2 = {
    'what', 'instead', 'have', 'has', 'had', 'will', 'there', 'those', 'this',
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')
tags_vals = set(tags_vals)

tag2idx = {t: i for i, t in enumerate(tags_vals)}
tag2name = {tag2idx[key]: key for key in tag2idx.keys()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

tok_dir = "/GW/Health-Corpus/work/roberta-finetuning-ner/roberta-tokenizer/roberta-base-"
tokenizer = RobertaTokenizer(tok_dir + "vocab.json", tok_dir + "merges.txt",
                             do_lower_case=False)

# %%
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in zip(sentences, labels):
    temp_lable = []
    temp_token = []

    # Add [CLS] at the front
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')

    for word, lab in zip(word_list, label):