Example #1
import os

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# read_conllu_files and train_bert_tokenizer come from the surrounding project (not shown here).
def tmp():
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"
    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t['form'] for t in sentence]))
    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir

    t = train_bert_tokenizer(sentences,
                             serialize_path=output_dir,
                             vocab_size=6000)
    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt",
                        oov_token="[UNK]",
                        is_padded=True)
    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)
Example #2
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")

    ret = Vocabulary()

    ret.set_from_file(filename=f"{cur_dir}/iwslt14/vocab.en", namespace="source_tokens")
    ret.set_from_file(filename=f"{cur_dir}/iwslt14/vocab.de", namespace="target_tokens")

    return ret
Example #3
def save_vocab_in_allennlp_format():
    first_data_filepath = filepaths_of_data_to_train_on[0]
    base_path = first_data_filepath[:first_data_filepath.rfind('.')]  # path without its extension
    numless_vocab_file = base_path + vocabword_ind_not_numbered_file_ending
    numless_label_file = base_path + label_ind_not_numbered_file_ending

    vocab = Vocabulary()
    vocab.set_from_file(numless_vocab_file,
                        is_padded=True,
                        oov_token=unk_token,
                        namespace='tokens')
    vocab.set_from_file(numless_label_file,
                        is_padded=False,
                        namespace='labels')
    vocab.save_to_files(dir_to_save_vocab_in)
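For reference, a minimal sketch (assuming standard AllenNLP) of reading the vocabulary back from the directory the function above writes to:

from allennlp.data import Vocabulary

# Reload the vocabulary saved by save_vocab_in_allennlp_format().
reloaded_vocab = Vocabulary.from_files(dir_to_save_vocab_in)
print(reloaded_vocab.get_vocab_size("tokens"), reloaded_vocab.get_vocab_size("labels"))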
Example #4
class VisualQATrainDataset(VisualQADataset):
    def __init__(self, train_images_result_file, train_qa_result_file,
                 train_filenames_result_file, vocab_result_file):
        super().__init__()
        self.init_common(filenames_result_file=train_filenames_result_file,
                         qa_result_file=train_qa_result_file,
                         images_result_file=train_images_result_file)
        self.vocab = Vocabulary()
        self.vocab.set_from_file(filename=vocab_result_file, oov_token="[UNK]")
        possible_answers = self.qa.answer.value_counts()
        self.answer_vocabulary = {
            ans: idx
            for idx, ans in enumerate(possible_answers.index)
        }

    def __getitem__(self, idx):
        info = self.qa.iloc[idx]
        answer = info['answer']
        image = self.preprocessed_imgs[self.image_id_to_index[
            info['image_id']]]
        question = self.text_to_instance(info["preprocessed_question"])
        return question, image, self.answer_vocabulary[answer]
Example #5
import os
from typing import Iterable

from allennlp.common.util import namespace_match
from allennlp.data import Vocabulary


def set_vocab_from_filename(vocab: Vocabulary, namespace_filename: str,
                            load_dir: str, non_padded_namespaces: Iterable[str]):
    """Set up the vocabulary from a file

    Arguments:
        vocab: The vocabulary
        namespace_filename: The file containing all the namespaces
            to be loaded
        load_dir: The directory to load the vocab from
        non_padded_namespaces: The namespaces that are not padded
            (like labels etc)
    Returns:
        ``Vocabulary``: The loaded vocabulary
    """
    namespace = namespace_filename.replace('.txt', '')
    if any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces):
        is_padded = False
    else:
        is_padded = True
    filename = os.path.join(load_dir, namespace_filename)
    vocab.set_from_file(filename, is_padded, namespace=namespace)
    return vocab
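A hypothetical usage sketch; the directory name "saved_vocab" is made up for illustration, and each *.txt file in it is assumed to be named after the namespace it holds:

import os

from allennlp.data import Vocabulary

vocab = Vocabulary()
for fname in os.listdir("saved_vocab"):  # hypothetical directory of per-namespace .txt files
    if fname.endswith(".txt"):
        vocab = set_vocab_from_filename(vocab, fname, "saved_vocab",
                                        non_padded_namespaces=["labels"])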
Example #6
from pathlib import Path

from allennlp.data import Vocabulary

from resolution.common.data.reader import BertSpanResolutionReader, BertWordSpanResolutionReader
from resolution.common.models import BertSpanPointerResolution


bert_path = "/home/zs261988/models/ptms/bert_rbt3_pytorch/"
pretrained_file = "/home/zs261988/models/mask_resolution/bert_rbt3_bs_task_expand/"
max_turn_len = 3
max_length = 256

validation_data_path = "/home/zs261988/data/rewrite/business/mask_alipay_val.txt"

# Build the vocabulary
print("Loading the vocabulary.........")
vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(bert_path + "vocab.txt",
                    is_padded=False, oov_token="[UNK]", namespace="bert_tags")

# Build the reader and the model
print("Defining the model........")
reader = BertSpanResolutionReader(model_name=bert_path,
                                  max_turn_len=max_turn_len,
                                  max_length=max_length)
model = BertSpanPointerResolution(vocab=vocab,
                                  model_name=bert_path,
                                  max_turn_len=max_turn_len,
                                  task_pretrained_file=Path(pretrained_file) / "best.th")
model = model.eval()

# Read the test-set data
instances = reader.read(validation_data_path)
instances.vocab = vocab
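As a hedged follow-up sketch, the instances read above can be run through the model with AllenNLP's generic Model.forward_on_instances; the actual output keys depend on BertSpanPointerResolution:

import torch

# Run a handful of instances through the eval-mode model without tracking gradients.
with torch.no_grad():
    outputs = model.forward_on_instances(list(instances)[:4])
print(outputs[0].keys())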
Example #7
    def __init__(self, 
                 vocab: Vocabulary,
                 student_xlm: TextFieldEmbedder,
                 teacher_xlm: TextFieldEmbedder,
                 labels_vocab_file: str,
                 training_tasks: Any,
                 validation_tasks: Any,
                 teacher_nli_head: FeedForward,
                 projector_feedforward: FeedForward = None,
                 loss: str = "l1",
                 reduction: str = "mean",
                 training_tasks_2print: List[str] = ["en", "de", "ru", "fr", "ur", "sw"],
                 valid_langs_2print: List[str] = ["en", "de", "ur", "sw", "ru"],
                 dropout: float = 0.0,
                 regularizer: Optional[RegularizerApplicator] = None,
                 feed_lang_ids: bool = True,
                 avg: bool = False) -> None:

        vocab.set_from_file(filename=labels_vocab_file, is_padded=False, namespace="labels")

        super(Aligner, self).__init__(vocab, regularizer)

        self._avg = avg

        self._teacher_xlm = teacher_xlm
        self._student_xlm = student_xlm

        self._teacher_nli_head = teacher_nli_head
        
        self._projector_feedforward = projector_feedforward
        if projector_feedforward is not None:
            assert projector_feedforward.get_input_dim() == student_xlm.get_output_dim()
            assert projector_feedforward.get_output_dim() == teacher_xlm.get_output_dim()

        if type(training_tasks) == dict:
            self._training_tasks = list(training_tasks.keys())
        else:
            self._training_tasks = training_tasks

        if type(validation_tasks) == dict:
            self._validation_tasks = list(validation_tasks.keys())
        else:
            self._validation_tasks = validation_tasks

        # self._src_embedder =  

        self._dropout = torch.nn.Dropout(p=dropout)
        
        if loss == "l1":
            self._loss = torch.nn.L1Loss(reduction=reduction)
        elif loss == "mse":
            self._loss = torch.nn.MSELoss(reduction=reduction)
        elif loss == "cos":
            self._loss = torch.nn.CosineEmbeddingLoss(reduction=reduction)
        elif loss == "smooth_l1":
            self._loss = torch.nn.SmoothL1Loss(reduction=reduction)
        else:
            raise NotImplementedError # TODO: try margin based losses

        self._per_lang_align_loss: Dict[str, Average] = dict()
        
        for taskname in self._training_tasks:
            # this will hide some metrics from tqdm, but they will still be computed
            self._per_lang_align_loss[taskname] = Average()
        self._avg_loss = Average()
        
        self._langs_pring_train = training_tasks_2print or "en"
        self._langs_print_val = valid_langs_2print
        if '*' in self._langs_pring_train:
            # assumes task names carry a language suffix, e.g. "nli_en"
            self._langs_pring_train = [t.split("_")[-1] for t in training_tasks]
        
        self._feed_lang_ids = feed_lang_ids   

        self._nli_per_lang_acc: Dict[str, CategoricalAccuracy] = dict()
        for taskname in self._validation_tasks:
            # this will hide some metrics from tqdm, but they will still be computed
            self._nli_per_lang_acc[taskname] = CategoricalAccuracy()
        self._nli_avg_acc = Average()
Example #8
from pathlib import Path
from allennlp.data import DataLoader, Vocabulary
from allennlp.data.samplers import BucketBatchSampler

from resolution.common.data.reader import BertSpanResolutionReader

basename = "/home/zs261988/"
data_path = "data/"
model_path = "models/ptms/"
model_name = "albert_void_tiny/"
vocab_file = "vocab.txt"
sample_file = "rewrite/mask/sample_100.txt"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(basename + model_path + model_name + vocab_file,
                    is_padded=False,
                    oov_token="[UNK]",
                    namespace="bert_tags")
reader = BertSpanResolutionReader(model_name=basename + model_path + model_name)

train_data = reader.read(basename + data_path + sample_file)
train_data.vocab = vocab

print("[PAD]: ", vocab.get_token_index("[PAD]", namespace="bert_tags"))
print("[CLS]: ", vocab.get_token_index("[CLS]", namespace="bert_tags"))
print("[SEP]: ", vocab.get_token_index("[SEP]", namespace="bert_tags"))

datasampler = BucketBatchSampler(train_data, batch_size=16)

dataloader = DataLoader(dataset=train_data, batch_sampler=datasampler)
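As a quick sanity check (a sketch assuming AllenNLP 1.x, where the loader yields dictionaries of padded tensors), the data loader can be iterated directly:

# Inspect the field names of one padded batch produced by the bucketing sampler.
for batch in dataloader:
    print(list(batch.keys()))
    break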