Example no. 1
import torch
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          BertTokenizerFast, RobertaTokenizer)


def init_tgt(params):
    """
    Initialize the parameters of the target model
    """
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    config = AutoConfig.from_pretrained(params.src_model,
                                        cache_dir=params.cache_dir)
    model = AutoModelForMaskedLM.from_pretrained(
        params.src_model,
        from_tf=bool(".ckpt" in params.src_model),
        config=config,
        cache_dir=params.cache_dir,
    )
    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = AutoTokenizer.from_pretrained(
            params.src_model,
            cache_dir=params.cache_dir,
            use_fast=True)

    # get English word-embeddings and bias
    src_embs = model.base_model.embeddings.word_embeddings.weight.detach(
    ).clone()
    src_bias = model.cls.predictions.bias.detach().clone()

    # initialize target tokenizer; we always use a BERT WordPiece tokenizer for the target language
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=True,
                                      strip_accents=False)

    tgt_embs, tgt_bias = guess(src_embs,
                               src_bias,
                               tgt_tokenizer,
                               src_tokenizer,
                               prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(),
        src_bias.norm().item()))
    model.base_model.embeddings.word_embeddings.weight.data = tgt_embs
    model.cls.predictions.bias.data = tgt_bias
    model.tie_weights()
    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model.base_model.embeddings.word_embeddings.weight.norm().item(),
        model.cls.predictions.bias.norm().item()))

    # save the model
    model_to_save = (model.module if hasattr(model, "module") else model
                     )  # Take care of distributed/parallel training
    model_to_save.save_pretrained(params.tgt_model)
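
A minimal sketch of how init_tgt might be invoked; the namespace fields mirror the attributes read above, while every concrete path and model name below is a placeholder rather than a value from the original project (guess remains the project-specific helper used inside the function).

# Hypothetical invocation (placeholder paths and model names)
from argparse import Namespace

params = Namespace(
    src_model='bert-base-cased',     # English source checkpoint (non-RoBERTa branch)
    src_vocab=None, src_merge=None,  # only needed for the RoBERTa branch
    tgt_vocab='tgt-vocab.txt',       # WordPiece vocabulary of the target language
    tgt_model='tgt-init',            # output directory for save_pretrained
    cache_dir='cache',
    prob=None,                       # optional word-translation probabilities
)
init_tgt(params)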
Example no. 2
    def __init__(self, model_dir):
        # load a fine-tuned checkpoint and its BPE vocab/merges from model_dir
        self.model = RobertaForSequenceClassification.from_pretrained(
            model_dir, output_attentions=True, output_hidden_states=True)
        self.tokenizer = RobertaTokenizer(
            os.path.join(model_dir, "vocab.json"),
            add_special_tokens=True,
            merges_file=os.path.join(model_dir, "merges.txt"),
        )
Example no. 3
class BaseDataset(ABC, IterableDataset, metaclass=DatasetRegistry):
    task = None
    tokenizer = RobertaTokenizer(vocab_file='vocabs/roberta-vocab.json', merges_file='vocabs/roberta-merges.txt')
    pad_token_id = tokenizer.convert_tokens_to_ids(['<pad>'])[0]
    mask_token_id = tokenizer.convert_tokens_to_ids(['<mask>'])[0]
    gap_token_id = tokenizer.convert_tokens_to_ids(['<gap>'])[0]

    def __init__(self, data_file, data_size, local_rank, world_size=None):
        if not os.path.isfile(data_file):
            raise FileNotFoundError(f'{data_file} does not exist or is a directory.')

        self.data_file = data_file
        self.size = data_size

        if local_rank == -1:
            self.start = 0
            self.step = 1
        else:
            self.start = local_rank
            self.step = world_size

    def __len__(self):
        return self.size

    def __iter__(self):
        file_iter = gzip.open(self.data_file, 'rt')
        islice_iter = islice(file_iter, self.start, None, self.step)
        processed_iter = map(self.process_line, islice_iter)
        return processed_iter

    @staticmethod
    def tokenize_first_segment(segment):
        return ['<s>'] + segment.split() + ['</s>']

    @staticmethod
    def tokenize_second_segment(segment):
        return ['</s>'] + segment.split() + ['</s>']

    @staticmethod
    def pad_2d(array_2d, pad_value=0):
        row_lengths = [len(row) for row in array_2d]
        max_len = max(row_lengths)
        for i in range(len(array_2d)):
            array_2d[i] += [pad_value for _ in range(max_len - row_lengths[i])]

        return array_2d

    @abstractmethod
    def process_line(self, line):
        pass

    @classmethod
    @abstractmethod
    def collate_fn(cls, batch):
        pass
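
To make the abstract interface concrete, here is a minimal sketch of a subclass; the class name, the task string, and the tab-separated "text<TAB>label" line format are illustrative assumptions, not part of the original code.

import torch  # assumed to be available in the surrounding module


class ToyClassificationDataset(BaseDataset):
    """Hypothetical subclass reading gzipped lines of "text<TAB>label"."""
    task = 'toy-classification'

    def process_line(self, line):
        text, label = line.rstrip('\n').split('\t')
        token_ids = self.tokenizer.convert_tokens_to_ids(
            self.tokenize_first_segment(text))
        return token_ids, int(label)

    @classmethod
    def collate_fn(cls, batch):
        token_ids, labels = zip(*batch)
        padded = cls.pad_2d([list(ids) for ids in token_ids],
                            pad_value=cls.pad_token_id)
        return torch.tensor(padded), torch.tensor(labels)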
Example no. 4
    def __init__(self):
        super(ReviewModel, self).__init__()
        tokenizer = RobertaTokenizer(
            vocab_file=Constants.VOCAB_FILE,
            merges_file=Constants.MERGES_FILE,
            add_prefix_space=True,
        )
        config = RobertaConfig(output_hidden_states=True)
        self.backbone = RobertaModel(config)
        self.backbone.resize_token_embeddings(len(tokenizer))
        self.fc = nn.Linear(in_features=config.hidden_size, out_features=1, bias=True)
Example no. 5
def init_tgt(params):
    """
    Initialize the parameters of the target model
    """
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    model = torch.load(params.src_model)
    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = BertTokenizer(params.src_vocab, do_lower_case=False)

    # get English word-embeddings and bias
    src_embs = model[MAP['word_embeddings']]
    src_bias = model[MAP['output_bias']]

    # initialize target tokenizer, we always use BertWordPieceTokenizer for the target language
    tgt_tokenizer = BertWordPieceTokenizer(params.tgt_vocab,
                                           unk_token=UNK_TOKEN,
                                           sep_token=SEP_TOKEN,
                                           cls_token=CLS_TOKEN,
                                           pad_token=PAD_TOKEN,
                                           mask_token=MASK_TOKEN,
                                           lowercase=False,
                                           strip_accents=False)

    tgt_embs, tgt_bias = guess(src_embs,
                               src_bias,
                               tgt_tokenizer,
                               src_tokenizer,
                               prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(),
        src_bias.norm().item()))
    model[MAP['word_embeddings']] = tgt_embs
    model[MAP['output_bias']] = tgt_bias
    model[MAP['output_weight']] = model[MAP['word_embeddings']]
    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model[MAP['word_embeddings']].norm().item(),
        model[MAP['output_bias']].norm().item()))

    # save the model
    torch.save(model, params.tgt_model)
Example no. 6
    def _roberta(self, text, unit="text"):
        """
        ex)
        """
        if self.bpe_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"],
                                                 return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path,
                                                  **self.config)

        return self.bpe_tokenizer._tokenize(text)
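
For reference, a standalone sketch (separate from the class above) of the byte-level BPE splitting that RobertaTokenizer performs; it uses the public tokenize method with a pretrained vocabulary instead of the config-driven vocab/merges paths.

# Illustrative only: byte-level BPE splitting with a pretrained RoBERTa vocabulary
from transformers import RobertaTokenizer

bpe = RobertaTokenizer.from_pretrained('roberta-base')
print(bpe.tokenize('Hello world'))  # e.g. ['Hello', 'Ġworld'] (Ġ marks a leading space)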
Example no. 7
def make_xnli_data(params):
    """
    read text file and tensorize
    text file input has the format "premise  hypothesis  label"
    """
    if params.merge:
        tokenizer = RobertaTokenizer(params.vocab, params.merge)
    else:
        tokenizer = BertTokenizer(params.vocab, do_lower_case=False)

    xs, ys = [], []
    labels = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
    pad_index = tokenizer.pad_token_id
    with open(params.input, 'r') as f:
        next(f)
        for i, line in enumerate(f):
            cols = line.rstrip().split('\t')

            if len(cols) != 3:
                print(f"potential error at line {i}")

            enc1 = tokenizer.encode(cols[0])
            enc2 = tokenizer.encode(cols[1])
            xs.append(enc1 + enc2[1:-1])  # [CLS] p1 ... [SEP] h1 ... hn
            ys.append(labels[cols[2]])

        # convert data to tensor
        xs = [torch.LongTensor(s) for s in xs]
        xs = pad_sequence(xs, batch_first=True, padding_value=pad_index)

        ys = torch.LongTensor(ys)
    unk_index = tokenizer.unk_token_id
    n_unks = sum([(s == unk_index).sum().item() for s in xs])
    n_toks = sum([len(s) for s in xs])
    p_unks = n_unks * 100. / n_toks
    print(f"{n_toks} tokens - {p_unks:.2f}% unknown words")
    data = {'xs': xs, 'ys': ys, 'pad_index': pad_index}

    torch.save(data, params.output)
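
The saved dictionary can be read back with torch.load; a minimal sketch, assuming params.output pointed to a hypothetical file name such as 'xnli.pt'.

import torch

# Reload the tensorized data written above (hypothetical file name)
data = torch.load('xnli.pt')
xs, ys, pad_index = data['xs'], data['ys'], data['pad_index']
print(xs.shape, ys.shape, pad_index)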
Example no. 8
                    help='Number of processes to use for data preprocessing.')
parser.add_argument('--data_dir',
                    type=str,
                    default='wikipedia/extracted',
                    help='Directory with data to preprocess.')
parser.add_argument('--save_dir',
                    type=str,
                    default='GT/text',
                    help='Directory for saving preprocessed data.')
parser.add_argument('--seed', type=int, default=111, help='Random seed.')
args = parser.parse_args()

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])

tokenizer = RobertaTokenizer(vocab_file='../vocabs/roberta-vocab.json',
                             merges_file='../vocabs/roberta-merges.txt',
                             additional_special_tokens=['<gap>'])

GAP_TOKEN = '<gap>'
UNK_TOKEN = '<unk>'
MAX_PAIR_LENGTH = 508
LOWER = False

# Sentences that are too long will be split on these tokens
split_tokens_1 = {
    '.', '?', '!', ',', ':', ';', 'that', 'which', 'who', 'whom', 'whose',
    'when', 'where', 'of', 'for', 'from', 'was', 'is', 'are', 'were', 'and',
    'or', 'but', 'if', 'whether', 'while', 'because', 'though', 'as', 'to'
}
split_tokens_2 = {
    'what', 'instead', 'have', 'has', 'had', 'will', 'there', 'those', 'this',
Example no. 9
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')
tags_vals = set(tags_vals)

tag2idx = {t: i for i, t in enumerate(tags_vals)}
tag2name = {tag2idx[key]: key for key in tag2idx.keys()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
tok_dir = "/GW/Health-Corpus/work/roberta-finetuning-ner/roberta-tokenizer/roberta-base-"

tokenizer = RobertaTokenizer(tok_dir + "vocab.json",
                             tok_dir + "merges.txt",
                             do_lower_case=False)

# %%
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in zip(sentences, labels):
    temp_label = []
    temp_token = []

    # Add [CLS] at the front
    temp_label.append('[CLS]')
    temp_token.append('[CLS]')

    for word, lab in zip(word_list, label):