def main():
    args = parse_arguments()
    words = collect_words(args.input_dir)
    mlbt = BertTokenizer(args.model_dir, do_lower_case=False)
    hubt = BertTokenizer(args.vocab_file, do_lower_case=False)
    mlstats = count_wordpieces(words, mlbt)
    hustats = count_wordpieces(words, hubt)
    print(f'Multilingual: {mlstats}')
    print(f'Hungarian: {hustats}')
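# Hypothetical sketch of the count_wordpieces helper referenced above; its real
# implementation is not shown here, so the returned statistics are an assumption.
def count_wordpieces(words, tokenizer):
    pieces = [len(tokenizer.tokenize(word)) for word in words]
    return {
        'words': len(words),
        'pieces': sum(pieces),
        'pieces_per_word': sum(pieces) / max(len(words), 1),
    }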
def test_data_collator_for_language_modeling(self):
    tokenizer = BertTokenizer(self.vocab_file)
    no_pad_features = [{"input_ids": list(range(10))},
                       {"input_ids": list(range(10))}]
    pad_features = [{"input_ids": list(range(5))},
                    {"input_ids": list(range(10))}]

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    tokenizer._pad_token = None
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    with self.assertRaises(ValueError):
        # Expect error due to missing padding token
        data_collator(pad_features)

    set_seed(42)  # For reproducibility
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(
        all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(
        all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
def collate_fn(batch_data):
    tokenizer = BertTokenizer('./data/bert/nezha-base-www/vocab.txt')
    max_len = max([len(x[0]) for x in batch_data]) + 2
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for text, label in batch_data:
        inputs = tokenizer.encode_plus(text=text,
                                       max_length=max_len,
                                       pad_to_max_length=True,
                                       is_pretokenized=True,
                                       return_token_type_ids=True,
                                       return_attention_mask=True,
                                       truncation=True)
        label = tokenizer.encode_plus(text=label,
                                      max_length=max_len,
                                      pad_to_max_length=True,
                                      is_pretokenized=True,
                                      return_token_type_ids=False,
                                      return_attention_mask=False,
                                      truncation=True)
        input_ids.append(inputs['input_ids'])
        token_type_ids.append(inputs['token_type_ids'])
        attention_mask.append(inputs['attention_mask'])
        labels.append(label['input_ids'])
    input_ids = torch.tensor(input_ids).long()
    token_type_ids = torch.tensor(token_type_ids).long()
    attention_mask = torch.tensor(attention_mask).float()
    labels = torch.tensor(labels).long()
    return input_ids, token_type_ids, attention_mask, labels
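# Usage sketch for the collate_fn above: wire it into a PyTorch DataLoader over
# (tokens, label_tokens) pairs. The sample data is illustrative, and it assumes
# a transformers version that still accepts the deprecated arguments used above.
from torch.utils.data import DataLoader

pairs = [(['我', '爱', '你'], ['谢', '谢']), (['你', '好'], ['再', '见'])]
loader = DataLoader(pairs, batch_size=2, collate_fn=collate_fn)
for input_ids, token_type_ids, attention_mask, labels in loader:
    print(input_ids.shape, attention_mask.shape, labels.shape)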
def main():
    config = get_config()
    args = train_args.setup_train_args()
    if args.seed:
        train_args.set_random_seed(args)
    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Vocabulary size of the tokenizer
    global pad_id
    # pad_id = tokenizer.convert_tokens_to_ids(PAD)
    # Create the output directory for the dialogue model
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)
    # Load the GPT-2 model
    model, n_ctx, optimizer = create_model(args, config)
    # Preprocess the raw data: convert the raw corpus into token ids.
    # Only needed when training the dialogue generation model.
    print('Producing tokens')
    # Unless the dataset changes, there is no need to rerun preprocess_raw_data
    # on every training run, since the generated data is identical.
    if not os.path.exists(args.train_tokenized_path):
        open(args.train_tokenized_path, 'w').close()  # create the output file
        preprocess_data.preprocess_raw_data(args, tokenizer, n_ctx)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    print('Starting training')
    train(model, args, tokenizer, optimizer, train_loss, train_accuracy)
    print('Training finished')
def createVocabulary(receipts):
    vocab = set()
    for receipt in receipts:
        words = receipt.dataWords
        for word in words:
            vocab.add(word)
    path = './data/prod_vocab.txt'
    with open(path, 'r') as f:
        for line in f:
            vocab.add(line[:-1])
    tokenizer = BertTokenizer(vocab_file=path, do_lower_case=False)
    new_set = set()
    for word in vocab:
        token_list = tokenizer.tokenize(word)
        if '[UNK]' in token_list:
            print(word)
            t = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word)
            for i, v in enumerate(token_list):
                if v == '[UNK]' and i < len(t):
                    for x in t:
                        new_set.add(x)
    with open('./data/prod_vocab.txt', 'w+') as f:
        for word in vocab.union(new_set):
            f.write(word + '\n')
    return vocab
def main():
    ## External argument settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file_path', '-trfp', type=str)
    parser.add_argument('--test_file_path', '-tefp', type=str)
    parser.add_argument('--valid_file_path', '-vafp', type=str)
    parser.add_argument('--output_file_path', '-outfp', type=str)
    args = parser.parse_args()
    ## Defaults
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()
    ##
    ## for debug
    # args.train_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/train_10.txt"
    # args.test_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/test_5.txt"
    # args.valid_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/validation_5.txt"
    # args.output_file_path="/user_data/Project/Controllable_Syntax_with_BERT/sequential_dataset"
    ##
    data_path_dict = {
        "train": args.train_file_path,
        "test": args.test_file_path,
        "validation": args.valid_file_path
    }
    for key, data_path in data_path_dict.items():
        print("get {0} data ...".format(key))
        semantic_list, syntactic_list = get_dataset_list(data_path)
        print(" get {0} all syntactic keyword list ...".format(key))
        all_syntactic_keyword_list = get_all_syntactic_keyword_list(
            syntactic_list, accepted_pos_list, tokenizer, nlp)
        print(" insert sep to {0} all syntactic keyword list ...".format(key))
        all_syntactic_keyword_with_sep_list = insert_sep_token(
            all_syntactic_keyword_list)
        print(" get {0} all sequence sentence list ...".format(key))
        all_sequence_sentence_list = get_all_sequence_sentence_list(
            syntactic_list, tokenizer)
        print(" get {0} embeddings ...".format(key))
        token_embedding_id_list, segment_embedding_list, attention_embedding_list, maskLM_embedding_list = get_embeddings(
            semantic_list, syntactic_list, all_syntactic_keyword_with_sep_list,
            all_sequence_sentence_list, tokenizer)
        print(" convert to feature {0} embeddings ...".format(key))
        convert_embedding_to_feature(args.output_file_path, key,
                                     token_embedding_id_list,
                                     segment_embedding_list,
                                     attention_embedding_list,
                                     maskLM_embedding_list)
        print(" {0} data finished".format(key))
    return 0
def build_model(self):
    """Create the GPT-2 generation model."""
    # Use the BERT tokenizer
    # Initialize the tokenizer
    self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
    # temp = self.tokenizer.convert_tokens_to_ids('')
    # print(self.tokenizer.convert_ids_to_tokens(temp))
    # Vocabulary size of the tokenizer
    self.vocab_size = len(self.tokenizer)
    self.pad_id = self.tokenizer.convert_tokens_to_ids(PAD)
    if self.args.pretrained_model:
        # A pretrained GPT-2 model was specified
        model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model)
    else:
        # No pretrained model specified: initialize the model from a config
        model_config = GPT2Config(self.args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # Resize the GPT-2 vocabulary to match the tokenizer's vocabulary
    model.resize_token_embeddings(self.vocab_size)
    print('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
def preprocess(self, data):
    """
    Receives text in the form of JSON and converts it into an encoding for the
    inference stage.

    :param data: Input to be passed through the layers for prediction
    :return: output - preprocessed encoding
    """
    text = data[0].get("data")
    if text is None:
        text = data[0].get("body")
    text = text.decode("utf-8")
    tokenizer = BertTokenizer(self.VOCAB_FILE)  # .from_pretrained("bert-base-cased")
    encoding = tokenizer.encode_plus(
        text,
        max_length=32,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",  # Return PyTorch tensors
        truncation=True,
    )
    return encoding
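# Call sketch for the handler above, assuming a TorchServe-style request list;
# `handler` and its VOCAB_FILE are stand-ins for however the class is set up.
payload = [{"data": "Stocks rallied after the earnings report.".encode("utf-8")}]
encoding = handler.preprocess(payload)
print(encoding["input_ids"].shape)  # torch.Size([1, 32]) with max_length=32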
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))
    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while True:
        text = input(">>User: ")
        # NOTE: the body of this loop was partially redacted in the source;
        # the encode/generate calls below are a plausible reconstruction.
        input_ids = tokenizer.encode(text, return_tensors="pt")
        sample_output = model.generate(input_ids, do_sample=True, max_length=50)
        print("Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             never_lowercase: List[str] = None,
             max_pieces: int = 512,
             truncate_long_sequences: bool = True) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")
    if os.path.isdir(pretrained_model):
        pretrained_model = os.path.join(pretrained_model, 'vocab.txt')
    bert_tokenizer = BertTokenizer(pretrained_model,
                                   do_lower_case=do_lowercase)
    super().__init__(
        vocab=bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="bert",
        use_starting_offsets=use_starting_offsets,
        max_pieces=max_pieces,
        do_lowercase=do_lowercase,
        never_lowercase=never_lowercase,
        start_tokens=["[CLS]"],
        end_tokens=["[SEP]"],
        separator_token="[SEP]",
        truncate_long_sequences=truncate_long_sequences)
def test_sop(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [{
        "input_ids": torch.tensor([0, 1, 2, 3, 4]),
        "token_type_ids": torch.tensor([0, 1, 2, 3, 4]),
        "sentence_order_label": i,
    } for i in range(2)]

    data_collator = DataCollatorForLanguageModeling(tokenizer)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5)))
    self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
    self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))

    data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                    pad_to_multiple_of=8)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
    self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
    self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
def test_data_collator_with_padding(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [{"input_ids": [0, 1, 2]},
                {"input_ids": [0, 1, 2, 3, 4, 5]}]

    data_collator = DataCollatorWithPadding(tokenizer, return_tensors="np")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, (2, 6))
    self.assertEqual(batch["input_ids"][0].tolist(),
                     [0, 1, 2] + [tokenizer.pad_token_id] * 3)

    data_collator = DataCollatorWithPadding(tokenizer,
                                            padding="max_length",
                                            max_length=10,
                                            return_tensors="np")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, (2, 10))

    data_collator = DataCollatorWithPadding(tokenizer,
                                            pad_to_multiple_of=8,
                                            return_tensors="np")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, (2, 8))
def test_sop(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [{
        "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
        "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
        "sentence_order_label": i,
    } for i in range(2)]

    data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                    return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
    self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
    self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])

    data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                    pad_to_multiple_of=8,
                                                    return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
    self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
    self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
def test_nsp(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [{
        "input_ids": [0, 1, 2, 3, 4],
        "token_type_ids": [0, 1, 2, 3, 4],
        "next_sentence_label": i
    } for i in range(2)]

    data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                    return_tensors="np")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, (2, 5))
    self.assertEqual(batch["token_type_ids"].shape, (2, 5))
    self.assertEqual(batch["labels"].shape, (2, 5))
    self.assertEqual(batch["next_sentence_label"].shape, (2,))

    data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                    pad_to_multiple_of=8,
                                                    return_tensors="np")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, (2, 8))
    self.assertEqual(batch["token_type_ids"].shape, (2, 8))
    self.assertEqual(batch["labels"].shape, (2, 8))
    self.assertEqual(batch["next_sentence_label"].shape, (2,))
def convert_data_to_context(filepath, dataset):
    DRCD = LoadJson(filepath)
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    # context_tokens = []
    # context_loss_tokens = []
    sample = []
    keyword_tokens = []
    # BertForMaskedLM
    for data in DRCD["data"]:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]
            little_context = context[:128]
            sample.append(little_context)
    index = round(len(sample) * 0.25)
    if dataset == "test1":
        small_sample = sample[:index]
    elif dataset == "test2":
        small_sample = sample[index:index * 2]
    elif dataset == "test3":
        small_sample = sample[index * 2:index * 3]
    else:
        small_sample = sample[index * 3:]
    for c in small_sample:
        # c_c = conversion_context(c, tokenizer, context_loss_tokens)
        k = context_keyword(c)
        # context_tokens.append(c_c)
        keyword_tokens.append(k[0])
    # return context_tokens
    return keyword_tokens
def explain_handle(self, model_wraper, text, target=1):  # pylint: disable=too-many-locals,unused-argument,arguments-differ
    """Captum explanations handler.

    Args:
        data_preprocess (Torch Tensor): Preprocessed data to be used for captum
        raw_data (list): The unprocessed data to get target from the request
    Returns:
        dict : A dictionary response with the explanations response.
    """
    model_wrapper = AGNewsmodelWrapper(self.model)
    tokenizer = BertTokenizer(self.vocab_file)
    model_wrapper.eval()
    model_wrapper.zero_grad()
    input_ids = torch.tensor(
        [tokenizer.encode(self.text, add_special_tokens=True)])
    input_embedding_test = model_wrapper.model.bert_model.embeddings(input_ids)
    preds = model_wrapper(input_embedding_test)
    out = np.argmax(preds.cpu().detach(), axis=1)
    out = out.item()
    ig_1 = IntegratedGradients(model_wrapper)
    attributions, delta = ig_1.attribute(  # pylint: disable=no-member
        input_embedding_test,
        n_steps=500,
        return_convergence_delta=True,
        target=1,
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy().tolist())
    feature_imp_dict = {}
    feature_imp_dict["words"] = tokens
    attributions_sum = self.summarize_attributions(attributions)
    feature_imp_dict["importances"] = attributions_sum.tolist()
    feature_imp_dict["delta"] = delta[0].tolist()
    return [feature_imp_dict]
def featurize(self, df):
    bert_model = BertModel.from_pretrained(self.data_path)
    bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                   do_lower_case=False,
                                   do_basic_tokenize=False)
    mecab = MeCab.Tagger('-Ochasen')
    data_list = df.rdd.collect()
    label_list = []
    vec_list = []
    for data in data_list:
        tmp_list = []
        node_list = data[1]
        for word in node_list:
            tmp_list.append(word)
        if len(tmp_list) != 0:
            label_list.append(float(data[0]))
            bert_tokens = bert_tokenizer.tokenize(
                " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
            token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
            tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
            all_outputs = bert_model(tokens_tensor)
            embedding = all_outputs[-2].detach().numpy()[0]
            vec = np.mean(embedding, axis=0).tolist()
            vec_list.append(Vectors.dense(vec))
    zip_list = zip(label_list, vec_list)
    new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
    return new_df
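# Isolated sketch of the sentence-vector recipe above, outside Spark; the
# ./bert-ja checkpoint path and sample tokens are placeholders, and with current
# transformers the last hidden states are at outputs[0] rather than [-2].
import numpy as np
import torch
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained("./bert-ja")
tok = BertTokenizer("./bert-ja/vocab.txt", do_lower_case=False,
                    do_basic_tokenize=False)
tokens = tok.tokenize(" ".join(["[CLS]", "猫", "が", "好き", "[SEP]"]))
ids = tok.convert_tokens_to_ids(tokens)
with torch.no_grad():
    hidden = model(torch.tensor(ids).unsqueeze(0))[0]  # (1, seq_len, hidden)
vec = np.mean(hidden[0].numpy(), axis=0)  # mean-pool over all tokens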
def tokenize_and_pad_samples(genes, labels):
    k = len(genes[0][0])
    if k == 4:
        kmer_filepath = '/home/brian/Downloads/fourmers.txt'
    elif k == 6:
        kmer_filepath = '/home/brian/Downloads/hexamers.txt'
    elif k == 8:
        kmer_filepath = '/home/brian/Downloads/octamers.txt'
    else:
        raise ValueError(f"No k-mer vocab file for k={k}")
    formatted_samples = [['[CLS]'] + sample + ['[SEP]'] for sample in genes]
    formatted_labels = [[0] + l + [0] for l in labels]
    tokenizer = BertTokenizer(kmer_filepath, max_len=MAX_LEN)
    print("TOKENIZER LENGTH", len(tokenizer))
    attention_masks = [
        np.concatenate([np.ones(len(l)), np.zeros(MAX_LEN - len(l))])
        for l in formatted_labels
    ]
    # seq_ids = tokenizer.convert_tokens_to_ids(formatted_samples)
    seq_ids = [
        tokenizer.convert_tokens_to_ids(sample)
        for sample in formatted_samples
    ]
    seq_ids = pad_sequences(seq_ids,
                            maxlen=MAX_LEN,
                            truncating='post',
                            padding='post')
    return seq_ids, attention_masks, formatted_labels
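# Toy invocation sketch for the function above. It only runs where the
# machine-specific k-mer vocab files exist; the sequences, labels, and MAX_LEN
# value here are illustrative assumptions.
MAX_LEN = 8  # module-level constant the function relies on
genes = [['ACGT', 'CGTA', 'GTAC'], ['TTAA', 'TAAC']]
labels = [[1, 0, 1], [0, 1]]
seq_ids, masks, padded_labels = tokenize_and_pad_samples(genes, labels)
print(seq_ids.shape)  # (2, MAX_LEN)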
def __init__(self, context: PyTorchTrialContext) -> None:
    # Read the configuration
    self.context = context
    self.data_config = self.context.get_data_config()
    # Create a TensorBoard logger
    self.logger = TorchWriter()
    # Create the tokenizer from the predefined vocabulary
    self.tokenizer = BertTokenizer(self.data_config["voc_path"],
                                   do_lower_case=False)
    # Label encoder
    if self.context.get_hparam("reduce_to_binary_problem"):
        class_num = 2
    else:
        class_num = 6
    # Initialize the model and wrap it in the Determined API
    model = ProtTransClassification(
        self.data_config["pretrained_path"],
        class_num=class_num,
        classification_feature=self.context.get_hparam("classification_feature"),
        dropout=self.context.get_hparam("classification_dropout"),
        freeze_bert=self.context.get_hparam("bert_freeze"))
    optimizer = Lamb(
        [{"params": model.wordencoding.parameters(),
          "lr": self.context.get_hparam("bert_lr")},
         {"params": model.classification.parameters()}],
        lr=self.context.get_hparam("classification_lr"))
    self.model = self.context.wrap_model(model)
    self.optimizer = self.context.wrap_optimizer(optimizer)
def __init__(self,
             squad_model_config: str,
             vocab_file: str,
             do_lower_case: bool,
             max_seq_length: int = 512,
             batch_size: int = 10,
             lang: str = 'en',
             **kwargs) -> None:
    config = json.load(open(squad_model_config))
    config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
    self.model = build_model(config)
    self.max_seq_length = max_seq_length
    if Path(vocab_file).is_file():
        vocab_file = str(expand_path(vocab_file))
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            vocab_file, do_lower_case=do_lower_case)
    self.batch_size = batch_size
    if lang == 'en':
        from nltk import sent_tokenize
        self.sent_tokenizer = sent_tokenize
    elif lang == 'ru':
        from ru_sent_tokenize import ru_sent_tokenize
        self.sent_tokenizer = ru_sent_tokenize
    else:
        raise RuntimeError('only en and ru languages are supported')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file_path', '-trfp', type=str)
    parser.add_argument('--test_file_path', '-tefp', type=str)
    args = parser.parse_args()
    ## for debug
    if args.train_file_path is None:
        args.train_file_path = "dataset/mingda_train_10.txt"
    if args.test_file_path is None:
        args.test_file_path = "dataset/mingda_test_5.txt"
    ##
    ## pre-load
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()
    ##
    data_path_dict = {
        "train": args.train_file_path,
        "test": args.test_file_path
    }
    for key, data_path in data_path_dict.items():
        print("get {0} data ...".format(key))
        semantics_list, syntactic_list = get_dataset_list(data_path)
        print("get {0} all syntactic keyword list ...".format(key))
        extrapolate_sequential_syntactic(syntactic_list, accepted_pos_list,
                                         tokenizer, nlp)
        print("")
def bert_pretraining(dataset, config):
    bert_tokenizer = BertTokenizer('./bert-base-chinese' + '/vocab.txt')
    model = BertModel.from_pretrained('./bert-base-chinese')
    model.eval()
    model.to(config.device)
    for batch in batch_slice(dataset, config.train_batch_size):
        tokens_tensor = []
        for instance in batch:
            instance.ids = bert_tokenizer.convert_tokens_to_ids(instance.chars)
            tokens_tensor.append(torch.tensor(instance.ids))
        tokens_tensor = pad_sequence(tokens_tensor).T
        attention_mask = torch.ne(tokens_tensor,
                                  torch.zeros_like(tokens_tensor))
        tokens_tensor = tokens_tensor.to(config.device)
        attention_mask = attention_mask.to(config.device)
        with torch.no_grad():
            outputs = model(tokens_tensor, attention_mask=attention_mask)
            encoded_layers = outputs[0]
        for index, instance in enumerate(batch):
            instance.embeddings = encoded_layers[
                index, 0:len(instance.ids), :].cpu().numpy()
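# Isolated sketch of the padding/mask trick used above: pad variable-length id
# tensors into one batch, then attend only to non-zero (non-pad) positions.
import torch
from torch.nn.utils.rnn import pad_sequence

ids = [torch.tensor([101, 2769, 102]), torch.tensor([101, 102])]
tokens_tensor = pad_sequence(ids).T                 # shape (2, 3), zero-padded
attention_mask = torch.ne(tokens_tensor,
                          torch.zeros_like(tokens_tensor))  # True at real tokens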
def __init__(self, model_file=DEFAULT_MODEL_URL, name="Dialog"):
    super(Dialog, self).__init__(name=name)
    if not os.path.exists(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/data')):
        os.mkdir(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/data'))
        ### download multiwoz data
        print('download data from', DEFAULT_ARCHIVE_FILE_URL)
    if not os.path.exists(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/save')):
        os.mkdir(os.path.join(DEFAULT_DIRECTORY, 'multiwoz/save'))
        ### download trained model
        print('download model from', DEFAULT_MODEL_URL)
    model_path = ""
    config = Config()
    parser = config.parser
    config = parser.parse_args()
    with open("assets/never_split.txt") as f:
        never_split = f.read().split("\n")
    self.tokenizer = BertTokenizer("assets/vocab.txt", never_split=never_split)
    self.nlu = BERTNLU()
    self.dst_ = DST(config).cuda()
    # NOTE: local_rank is assumed to be defined at module scope in the original.
    ckpt = torch.load("save/model_Sun_Jun_21_07:08:48_2020.pt",
                      map_location=lambda storage, loc: storage.cuda(local_rank))
    self.dst_.load_state_dict(ckpt["model"])
    self.dst_.eval()
    self.policy = RulePolicy()
    self.nlg = TemplateNLG(is_user=False)
    self.init_session()
    self.slot_mapping = {"leave": "leaveAt", "arrive": "arriveBy"}
def create_Bert_tokenizer(use_pretrained=True, **kwargs):
    if use_pretrained:
        if 'model' not in kwargs:
            raise ValueError("A 'model' key is required to select the pretrained model")
        model_path = ModelConfig.Bert_Pretrained_Model_Map.get(kwargs['model'], None)
        if model_path is None:
            raise ValueError("Unknown pretrained model; please choose again")
        tokenizer_path = os.path.join(model_path, 'vocab.txt')
        Tokenizer = BertTokenizer(tokenizer_path)
    else:
        if 'vocab_file' not in kwargs:
            raise ValueError("Please input vocab file path")
        path = kwargs.get('vocab_file')
        Tokenizer = BertTokenizer(path)
    return Tokenizer
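# Usage sketch for both construction paths of the factory above; the
# 'bert-base' key and local vocab path are illustrative values, defined by
# ModelConfig in the real code.
tok_pretrained = create_Bert_tokenizer(use_pretrained=True, model='bert-base')
tok_custom = create_Bert_tokenizer(use_pretrained=False, vocab_file='./vocab.txt')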
def build_data(self):
    self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
    self.vocab_size = len(self.tokenizer)
    self.pad_id = self.tokenizer.convert_tokens_to_ids('[PAD]')
    # Preprocess the raw data: convert the raw corpus into token ids
    if self.args.raw:
        for subset in ['train', 'valid', 'test']:
            self.preprocess_raw_data(subset)
    # Load the tokenized data
    self.subset2data = {}
    with open(self.args.test_tokenized_path, "r", encoding="utf8") as f:
        self.subset2data['test'] = f.read()
    if not self.args.do_eval:
        with open(self.args.train_tokenized_path, "r", encoding="utf8") as f:
            self.subset2data['train'] = f.read()
        with open(self.args.valid_tokenized_path, "r", encoding="utf8") as f:
            self.subset2data['valid'] = f.read()
    # Split each subset into one conversation per line
    for subset in self.subset2data:
        self.subset2data[subset] = self.subset2data[subset].split("\n")
    self.logger.info("Train/Valid/Test set has {} convs".format(
        [len(self.subset2data[subset]) for subset in self.subset2data]))
def pre_proc(self, examples):
    self.max_seq_length = 384
    self.max_query_length = 64
    self.doc_stride = 128
    eval_features = []
    cache_path = 'eval_features.pickle'
    # Load features if cached, convert from examples otherwise.
    if os.path.exists(cache_path):
        log.info("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        log.info("Creating tokenizer...")
        tokenizer = BertTokenizer(self.vocab_path)
        log.info("Converting examples to features...")

        def append_feature(feature):
            eval_features.append(feature)

        convert_examples_to_features(examples=examples,
                                     tokenizer=tokenizer,
                                     max_seq_length=self.max_seq_length,
                                     doc_stride=self.doc_stride,
                                     max_query_length=self.max_query_length,
                                     is_training=False,
                                     output_fn=append_feature,
                                     verbose_logging=False)
        log.info("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)
    print("len(eval_features)", len(eval_features))
    return eval_features
def test_plm(self):
    tokenizer = BertTokenizer(self.vocab_file)
    no_pad_features = [{"input_ids": list(range(10))},
                       {"input_ids": list(range(10))}]
    pad_features = [{"input_ids": list(range(5))},
                    {"input_ids": list(range(10))}]

    data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)

    batch = data_collator(pad_features)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(no_pad_features)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    example = [torch.randint(5, [5])]
    with self.assertRaises(ValueError):
        # Expect error due to odd sequence length
        data_collator(example)
def __init__(self, path, vocab_path):
    self.path = path
    self.csv = pd.read_csv(self.path, encoding="cp949")
    self.normalize()
    self.param = self.csv.keys()
    self.data = self.csv.to_numpy().astype(np.int32).astype(str)
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.train_labels = None
    self.val_labels = None
    self.test_labels = None
    self.split_data()
    self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                   tokenize_chinese_chars=False)
    self.train_encoding = self.tokenizer(*get_token_param(self.train_data),
                                         return_tensors="pt")
    self.val_encoding = self.tokenizer(*get_token_param(self.val_data),
                                       return_tensors="pt")
    self.test_encoding = self.tokenizer(self.test_data[:, :-7].tolist(),
                                        self.test_data[:, -7:].tolist(),
                                        return_tensors="pt")
    self.train_dataset = FoodDataset(self.train_encoding, self.train_labels)
    self.val_dataset = FoodDataset(self.val_encoding, self.val_labels)
    self.test_dataset = FoodDataset(self.test_encoding, self.test_labels)
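# Hypothetical sketch of the get_token_param helper used above, which is not
# shown in this section; the test-set call suggests it splits each row into a
# (first-segment, second-segment) sentence pair for the tokenizer.
def get_token_param(data):
    return data[:, :-7].tolist(), data[:, -7:].tolist()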
def __init__(self,
             image_root: str,
             scibert_path: str,
             lazy: bool = False,
             limit: int = None,
             max_sequence_length: int = 512,
             different_type_for_refs: bool = True,
             use_refs: bool = True):
    super().__init__(lazy)
    self.image_root = image_root
    config = BertConfig.from_json_file(os.path.join(scibert_path, 'config.json'))
    self.tokenizer = BertTokenizer(config=config,
                                   vocab_file=os.path.join(scibert_path, 'vocab.txt'))
    self.token_indexer = {
        'tokens': BertFromConfigIndexer(config=config,
                                        vocab_path=os.path.join(scibert_path, 'vocab.txt'),
                                        namespace='bert_tokens')
    }
    expected_img_size = 224
    self.image_transform = transforms.Compose([
        transforms.Resize(expected_img_size),
        transforms.CenterCrop(expected_img_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    self.use_refs = use_refs
    self.different_type_for_refs = different_type_for_refs
    self.limit = limit
    self.max_sequence_length = max_sequence_length
    self.word_tokenizer = WordTokenizer()
    self.caption_field = "caption"
def acs_predict():
    dataset_dir = "../../datasets/acs-20210530-gold"
    # tokenizer
    tokenizer = BertTokenizer(
        "../../weights/biobert-pt-v1.0-pubmed-pmc/vocab.txt",
        do_lower_case=False)
    net = EndToEnd("../../weights/biobert-pt-v1.0-pubmed-pmc")
    net.load_state_dict(
        torch.load("../../weights/chemprot-cls-end-to-end/3layer-e2e-2"))
    net = net.cuda()
    net.eval()
    # iterate over every article folder in the dataset directory
    for pub_num in tqdm(os.listdir(dataset_dir)):
        article_dir = os.path.join(dataset_dir, pub_num)
        assert os.path.isdir(article_dir)
        dataset = ACSDataset(data_path=os.path.join(article_dir, "re_input.tsv"),
                             tokenizer=tokenizer,
                             max_seq_len=128)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=256,
                                num_workers=8,
                                shuffle=False,
                                collate_fn=acs_collate_fn)
        output = predict_net(pub_num, net, dataloader)
        with open(os.path.join(article_dir, "re_output.tsv"), "w",
                  encoding="utf8") as fout:
            fout.write("id1\tid2\tclass\tconfidence\n")
            for _, (id1, id2, pred, score) in enumerate(output):
                fout.write(f"{id1}\t{id2}\t{pred}\t{score}\n")