import os

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer


def tmp():
    # read_conllu_files and train_bert_tokenizer are project-local helpers
    # assumed to be in scope.
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"

    # Collect whitespace-joined sentences from the CoNLL-U documents.
    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t["form"] for t in sentence]))

    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir
    t = train_bert_tokenizer(sentences, serialize_path=output_dir, vocab_size=6000)

    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")

    # Load the freshly trained wordpiece vocabulary into an AllenNLP Vocabulary.
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt", oov_token="[UNK]", is_padded=True)

    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)

from typing import Iterable

from allennlp.data import Instance, Vocabulary


def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    # cur_dir is a module-level variable assumed to be defined elsewhere.
    print("Building the vocabulary")
    ret = Vocabulary()
    ret.set_from_file(filename=f"{cur_dir}/iwslt14/vocab.en", namespace="source_tokens")
    ret.set_from_file(filename=f"{cur_dir}/iwslt14/vocab.de", namespace="target_tokens")
    return ret

from allennlp.data import Vocabulary


def save_vocab_in_allennlp_format():
    # filepaths_of_data_to_train_on, vocabword_ind_not_numbered_file_ending,
    # label_ind_not_numbered_file_ending, unk_token and dir_to_save_vocab_in
    # are module-level names assumed to be defined elsewhere.
    first_data_filepath = filepaths_of_data_to_train_on[0]
    numless_vocab_file = (first_data_filepath[:first_data_filepath.rfind('.')]
                          + vocabword_ind_not_numbered_file_ending)
    numless_label_file = (first_data_filepath[:first_data_filepath.rfind('.')]
                          + label_ind_not_numbered_file_ending)

    vocab = Vocabulary()
    vocab.set_from_file(numless_vocab_file, is_padded=True, oov_token=unk_token, namespace='tokens')
    vocab.set_from_file(numless_label_file, is_padded=False, namespace='labels')
    vocab.save_to_files(dir_to_save_vocab_in)

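# Illustrative round-trip check, not part of the original function: a directory
# written by save_to_files can be reloaded with AllenNLP's Vocabulary.from_files.
reloaded = Vocabulary.from_files(dir_to_save_vocab_in)
print(reloaded.get_vocab_size("tokens"), reloaded.get_vocab_size("labels"))
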
from allennlp.data import Vocabulary


class VisualQATrainDataset(VisualQADataset):
    # VisualQADataset is a project-local base class assumed to be defined elsewhere.

    def __init__(self, train_images_result_file, train_qa_result_file,
                 train_filenames_result_file, vocab_result_file):
        super().__init__()
        self.init_common(filenames_result_file=train_filenames_result_file,
                         qa_result_file=train_qa_result_file,
                         images_result_file=train_images_result_file)
        self.vocab = Vocabulary()
        self.vocab.set_from_file(filename=vocab_result_file, oov_token="[UNK]")

        # Map each observed answer string to an integer class index.
        possible_answers = self.qa.answer.value_counts()
        self.answer_vocabulary = {ans: idx for idx, ans in enumerate(possible_answers.index)}

    def __getitem__(self, idx):
        info = self.qa.iloc[idx]
        answer = info['answer']
        image = self.preprocessed_imgs[self.image_id_to_index[info['image_id']]]
        question = self.text_to_instance(info["preprocessed_question"])
        return question, image, self.answer_vocabulary[answer]

import os
from typing import Iterable

from allennlp.common.util import namespace_match
from allennlp.data import Vocabulary


def set_vocab_from_filename(vocab: Vocabulary,
                            namespace_filename: str,
                            load_dir: str,
                            non_padded_namespaces: Iterable[str]) -> Vocabulary:
    """Set up the vocabulary from a file.

    Arguments:
        vocab: The vocabulary.
        namespace_filename: The file containing the tokens for the namespace to be
            loaded; the filename minus the ``.txt`` extension is used as the
            namespace name.
        load_dir: The directory to load the vocab from.
        non_padded_namespaces: The namespaces that are not padded (like labels etc.).

    Returns:
        ``Vocabulary``: The loaded vocabulary.
    """
    namespace = namespace_filename.replace('.txt', '')
    is_padded = not any(
        namespace_match(pattern, namespace) for pattern in non_padded_namespaces
    )
    filename = os.path.join(load_dir, namespace_filename)
    vocab.set_from_file(filename, is_padded, namespace=namespace)
    return vocab

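# Minimal usage sketch (illustrative only): the directory "vocab_dir" and the
# files "tokens.txt"/"labels.txt" are assumptions, not part of the snippet above.
# "tokens.txt" is assumed to contain the OOV token, which set_from_file requires
# when is_padded=True.
example_vocab = Vocabulary()
example_vocab = set_vocab_from_filename(example_vocab, "tokens.txt", "vocab_dir", ["labels"])
example_vocab = set_vocab_from_filename(example_vocab, "labels.txt", "vocab_dir", ["labels"])
print(example_vocab.get_vocab_size("tokens"), example_vocab.get_vocab_size("labels"))
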
from pathlib import Path

from allennlp.data import Vocabulary

from resolution.common.data.reader import BertSpanResolutionReader, BertWordSpanResolutionReader
from resolution.common.models import BertSpanPointerResolution

bert_path = "/home/zs261988/models/ptms/bert_rbt3_pytorch/"
pretrained_file = "/home/zs261988/models/mask_resolution/bert_rbt3_bs_task_expand/"
max_turn_len = 3
max_length = 256
validation_data_path = "/home/zs261988/data/rewrite/business/mask_alipay_val.txt"

# Build the vocabulary.
print("Loading vocabulary.........")
vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(bert_path + "vocab.txt",
                    is_padded=False,
                    oov_token="[UNK]",
                    namespace="bert_tags")

# Build the reader and the model.
print("Defining the model........")
reader = BertSpanResolutionReader(model_name=bert_path,
                                  max_turn_len=max_turn_len,
                                  max_length=max_length)
model = BertSpanPointerResolution(vocab=vocab,
                                  model_name=bert_path,
                                  max_turn_len=max_turn_len,
                                  task_pretrained_file=Path(pretrained_file) / "best.th")
model = model.eval()

# Read the validation data.
instances = reader.read(validation_data_path)
instances.vocab = vocab

# __init__ of the Aligner model (class definition and module imports not shown in this snippet).
def __init__(self,
             vocab: Vocabulary,
             student_xlm: TextFieldEmbedder,
             teacher_xlm: TextFieldEmbedder,
             labels_vocab_file: str,
             training_tasks: Any,
             validation_tasks: Any,
             teacher_nli_head: FeedForward,
             projector_feedforward: FeedForward = None,
             loss: str = "l1",
             reduction: str = "mean",
             training_tasks_2print: List[str] = ["en", "de", "ru", "fr", "ur", "sw"],
             valid_langs_2print: List[str] = ["en", "de", "ur", "sw", "ru"],
             dropout: float = 0.0,
             regularizer: Optional[RegularizerApplicator] = None,
             feed_lang_ids: bool = True,
             avg: bool = False) -> None:
    # Load the label vocabulary before calling the parent constructor.
    vocab.set_from_file(filename=labels_vocab_file, is_padded=False, namespace="labels")
    super(Aligner, self).__init__(vocab, regularizer)

    self._avg = avg
    self._teacher_xlm = teacher_xlm
    self._student_xlm = student_xlm
    self._teacher_nli_head = teacher_nli_head
    self._projector_feedforward = projector_feedforward
    if projector_feedforward is not None:
        assert projector_feedforward.get_input_dim() == student_xlm.get_output_dim()
        assert projector_feedforward.get_output_dim() == teacher_xlm.get_output_dim()

    if type(training_tasks) == dict:
        self._training_tasks = list(training_tasks.keys())
    else:
        self._training_tasks = training_tasks

    if type(validation_tasks) == dict:
        self._validation_tasks = list(validation_tasks.keys())
    else:
        self._validation_tasks = validation_tasks

    # self._src_embedder =
    self._dropout = torch.nn.Dropout(p=dropout)

    if loss == "l1":
        self._loss = torch.nn.L1Loss(reduction=reduction)
    elif loss == "mse":
        self._loss = torch.nn.MSELoss(reduction=reduction)
    elif loss == "cos":
        self._loss = torch.nn.CosineEmbeddingLoss(reduction=reduction)
    elif loss == "smooth_l1":
        self._loss = torch.nn.SmoothL1Loss(reduction=reduction)
    else:
        raise NotImplementedError
    # TODO: try margin based losses

    self._per_lang_align_loss: Dict[str, Average] = dict()
    for taskname in self._training_tasks:
        # this will hide some metrics from tqdm, but they will still be computed
        self._per_lang_align_loss[taskname] = Average()
    self._avg_loss = Average()

    self._langs_print_train = training_tasks_2print or ["en"]
    self._langs_print_val = valid_langs_2print
    if '*' in self._langs_print_train:
        # Task-name separator assumed to be "_" (e.g. "align_en" -> "en").
        self._langs_print_train = [t.split("_")[-1] for t in training_tasks]

    self._feed_lang_ids = feed_lang_ids

    self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()
    for taskname in self._validation_tasks:
        # this will hide some metrics from tqdm, but they will still be computed
        self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
    self._nli_avg_acc = Average()

from pathlib import Path

from allennlp.data import DataLoader, Vocabulary
from allennlp.data.samplers import BucketBatchSampler

from resolution.common.data.reader import BertSpanResolutionReader

basename = "/home/zs261988/"
data_path = "data/"
model_path = "models/ptms/"
model_name = "albert_void_tiny/"
vocab_file = "vocab.txt"
sample_file = "rewrite/mask/sample_100.txt"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(basename + model_path + model_name + vocab_file,
                    is_padded=False,
                    oov_token="[UNK]",
                    namespace="bert_tags")

reader = BertSpanResolutionReader(model_name=basename + model_path + model_name)
train_data = reader.read(basename + data_path + sample_file)
train_data.vocab = vocab

print("[PAD]: ", vocab.get_token_index("[PAD]", namespace="bert_tags"))
print("[CLS]: ", vocab.get_token_index("[CLS]", namespace="bert_tags"))
print("[SEP]: ", vocab.get_token_index("[SEP]", namespace="bert_tags"))

datasampler = BucketBatchSampler(train_data, batch_size=16)
dataloader = DataLoader(dataset=train_data, batch_sampler=datasampler)