def __init__(self, vocab_file, max_len=None, do_basic_tokenize=True, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
    """Build a BertTokenizer from a one-wordpiece-per-line vocabulary file.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file.
        max_len: Optional artificial cap on tokenized sequence length; the
            effective maximum is the minimum of this value (if given) and
            the underlying BERT model's own sequence length.
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        never_split: Tokens which will never be split during tokenization.
            Only has an effect when do_wordpiece_only=False.
    """
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    # Inverse mapping: wordpiece id -> token, in vocabulary order.
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    # Effectively "no limit" when max_len is not supplied.
    self.max_len = int(1e12) if max_len is None else max_len
def from_config(cls, config: Config):
    """Alternate constructor: assemble the tokenizer from a Config object."""
    # Wordpiece pieces first: vocabulary, then the tokenizer built on it.
    wp_vocab = load_vocab(config.wordpiece_vocab_path)
    wp_tokenizer = WordpieceTokenizer(vocab=wp_vocab)
    # The basic (pre-wordpiece) tokenizer comes from the component registry.
    base_tokenizer = create_component(
        ComponentType.TOKENIZER, config.basic_tokenizer
    )
    return cls(wp_vocab, base_tokenizer, wp_tokenizer)
def __init__(self, min_char: int, vocab_file: str, lower: bool, add_sentence_boundary: bool, add_word_boundary: bool, use_cuda: bool):
    """Initialize the batch builder and its wordpiece tokenizer.

    All boundary/casing options are forwarded to the parent class; this
    subclass only adds a vocabulary and a WordpieceTokenizer over it.
    """
    parent_kwargs = dict(
        min_char=min_char,
        lower=lower,
        add_sentence_boundary=add_sentence_boundary,
        add_word_boundary=add_word_boundary,
        use_cuda=use_cuda,
    )
    super(WordPieceBatch, self).__init__(**parent_kwargs)
    vocab = load_vocab(vocab_file=vocab_file)
    self.vocab = vocab
    self.tokenizer = WordpieceTokenizer(vocab=vocab)
def __init__(self, vocab_file, do_lower_case=False, max_len=None):
    """Build a wordpiece tokenizer from *vocab_file*.

    Note: do_lower_case is accepted but not used in this constructor —
    presumably consumed elsewhere; verify against callers.
    """
    if not os.path.isfile(vocab_file):
        # Error text kept verbatim (including the missing space after the
        # period) so downstream matching on the message is unaffected.
        message = (
            "Can't find a vocabulary file at path '{}'."
            "To load the vocabulary from a Google pretrained "
            "model use "
            "`tokenizer = "
            "BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
        )
        raise ValueError(message.format(vocab_file))
    vocab = tokenization.load_vocab(vocab_file)
    self.vocab = vocab
    # id -> token lookup, preserving vocabulary order.
    self.ids_to_tokens = OrderedDict((index, token) for token, index in vocab.items())
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
    if max_len is None:
        self.max_len = int(1e12)  # effectively unbounded
    else:
        self.max_len = max_len
def _wordpiece(self, text, unit="text"): """ ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld'] """ if self.subword_tokenizer is None: vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True) vocab = load_vocab(vocab_path) self.subword_tokenizer = WordpieceTokenizer(vocab) tokens = [] if unit == "word": for sub_token in self.subword_tokenizer.tokenize(text): tokens.append(sub_token) else: for token in self.word_tokenizer.tokenize(text): for sub_token in self.subword_tokenizer.tokenize(token): tokens.append(sub_token) return tokens
def __init__(self, vocab_file, do_lower_case=True, max_len=None, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"), add_special_tokens=()):
    """Build a BertTokenizer with basic + wordpiece tokenization.

    Args:
        vocab_file: path to a one-wordpiece-per-line vocabulary file.
        do_lower_case: whether the basic tokenizer lowercases input.
        max_len: optional artificial cap on tokenized sequence length.
        never_split: tokens that must never be split during tokenization.
        add_special_tokens: extra tokens appended to the vocabulary and to
            never_split.

    Note: defaults changed from mutable lists to tuples (the classic
    mutable-default-argument pitfall); list arguments from callers still
    work unchanged.
    """
    # Extend never_split with the user-provided special tokens.
    add_special_tokens = list(add_special_tokens)
    never_split = list(never_split) + add_special_tokens
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(vocab_file))
    self.vocab = load_vocab(vocab_file, add_special_tokens=add_special_tokens)
    # Inverse mapping: wordpiece id -> token.
    self.ids_to_tokens = collections.OrderedDict(
        (ids, tok) for tok, ids in self.vocab.items())
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                          never_split=never_split)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    # Effectively "no limit" when max_len is not supplied.
    self.max_len = max_len if max_len is not None else int(1e12)
def __init__(self, drop_rate=0, gpu=True, vocab_path='/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt'):
    """CNN discriminator over BERT-sized word embeddings.

    Args:
        drop_rate: dropout probability before the final classifier layer.
        gpu: whether the caller intends to run on GPU (stored, not acted on
            here).
        vocab_path: path to the vocabulary file (new parameter; default
            preserves the previously hard-coded path, so existing callers
            are unaffected).
    """
    super(TowardModel, self).__init__()
    # Special-token ids used by the generation side of the model.
    self.UNK_IDX = 1
    self.PAD_IDX = 2
    self.START_IDX = 3
    self.EOS_IDX = 4
    self.MAX_SENT_LEN = 30
    self.gpu = gpu
    self.n_vocab = 6222
    self.emb_dim = 768
    self.vocab = load_vocab(vocab_path)
    # Inverse vocab: id -> token (dict comprehension replaces manual loop).
    self.pos2token = {v: k for k, v in self.vocab.items()}
    self.word_dim = self.emb_dim  # 768
    # PAD_IDX is passed as padding_idx so PAD embeddings stay zero-gradient.
    self.word_emb = nn.Embedding(self.n_vocab, self.word_dim, self.PAD_IDX)
    # --- Discriminator (classifier): parallel conv banks over n-grams 2..5.
    self.channel_out = 100
    self.conv2d_2 = nn.Conv2d(1, self.channel_out, (2, self.emb_dim))
    self.conv2d_3 = nn.Conv2d(1, self.channel_out, (3, self.emb_dim))
    self.conv2d_4 = nn.Conv2d(1, self.channel_out, (4, self.emb_dim))
    self.conv2d_5 = nn.Conv2d(1, self.channel_out, (5, self.emb_dim))
    self.fc_drop = nn.Dropout(drop_rate)
    # Concatenated pooled features from the four conv banks -> 2 classes.
    self.disc_fc = nn.Linear(4 * self.channel_out, 2)
    # Parameters trained for the classifier objective.
    # self.matrix_A.parameters()
    self.cls_params = (
        list(self.conv2d_2.parameters())
        + list(self.conv2d_3.parameters())
        + list(self.conv2d_4.parameters())
        + list(self.conv2d_5.parameters())
        + list(self.disc_fc.parameters())
        + list(self.word_emb.parameters())
    )
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig, BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

## **** load label description
import biLSTM.encoder.data_loader as biLSTM_data_loader
import biLSTM.encoder.encoder_model as biLSTM_encoder_model
import biLSTM.encoder.entailment_model as biLSTM_entailment_model
import biLSTM.encoder.bi_lstm_model as bi_lstm_model

# Maximum token length for a GO-term definition text.
MAX_SEQ_LEN_LABEL_DEF = 512 ## max len for GO def (probably can be smaller)

# NOTE(review): `args`, `label_in_ontology`, `label_to_test`, and
# `GCN_data_loader` are defined elsewhere in this file — not visible here.
# Indentation below was reconstructed; presumably everything down to the end
# of this fragment runs only when a w2v embedding is requested — confirm.
if args.w2v_emb is not None: ## we can just treat each node as a vector without word description
    Vocab = load_vocab(args.vocab_list) # all words found in pubmed and trained in w2v ... should trim down
    ## reading in feature label is in @GCN folder. too lazy to port this function out.
    LabelDescLoader = GCN_data_loader.LabelProcessor()
    if args.tree: # @label_in_ontology to get GO in the whole ontology, will be needed if we use tree method
        # Tree mode: featurize every label in the whole ontology.
        LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_in_ontology)
        LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_in_ontology, tokenize_style='space')
    else: ## only get vectors for labels we want.
        LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_to_test)
        LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_to_test, tokenize_style='space')
from itertools import chain
from tqdm import tqdm
import sys

# Make the locally checked-out pytorch_pretrained_BERT fork importable.
sys.path.insert(0, "/DATA/joosung/pytorch_pretrained_BERT_master")
from pytorch_pretrained_bert.tokenization import load_vocab
from pytorch_bert_embedding import *
import torch.optim as optim

# Multilingual cased BERT used for embeddings (bert_model_load comes from
# pytorch_bert_embedding's star import).
bert_model, bert_tokenizer = bert_model_load('bert-base-multilingual-cased')
n_iter = 50
vocab_size = 6222
mb_size = 1
vocab = load_vocab(
    '/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gpu_use = True


def main():
    # Load a pretrained TowardModel checkpoint and put it in eval mode.
    from model import TowardModel
    model = TowardModel(gpu=gpu_use)
    model_name = 'simple_model_50000'
    model.load_state_dict(torch.load('models/{}'.format(model_name)))
    model = model.to(device)
    model.eval()
    # NOTE(review): file handle is never closed in the visible portion, and
    # main() appears truncated here — the rest of the body is out of view.
    f = open("../../sentiment_data/nsmc-master/ratings_train.txt", 'r')
    lines = f.readlines()
import torch
import argparse
from tqdm import tqdm, trange
import os
import re

# Resolve data paths relative to this script's own directory.
base_path = os.path.dirname(os.path.abspath(__file__))
tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path), do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Masked-LM model restored from a local fine-tuned checkpoint.
model = BertForMaskedLM.from_pretrained('checkpoint/')
model.to(device)
model.eval()
vocab = load_vocab('{}/data/vocab.txt'.format(base_path))
# id -> token reverse lookup.
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    # Wrap the sentence with BERT's [CLS]/[SEP] markers.
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
    result = []
    # NOTE(review): loop indentation reconstructed and the function body
    # appears truncated here — presumably each non-special token is masked
    # in turn and scored; confirm against the original file.
    for i, token in enumerate(tokens):
        # tokens preprocessing
        if i != 0 and i != tokens_length - 1:
            tokens[i] = '[MASK]'
        ids = tokenizer.convert_tokens_to_ids(tokens)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os

# Resolve data paths relative to this script's own directory.
base_path = os.path.dirname(os.path.abspath(__file__))
tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path), do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Stock uncased masked-LM model from the HuggingFace hub cache.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()
vocab = load_vocab(vocab_file='{}/data/vocab.txt'.format(base_path))
# id -> token reverse lookup.
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    # Wrap the sentence with BERT's [CLS]/[SEP] markers and convert to ids.
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # Truncate to BERT's configured maximum sequence length of 128.
    # NOTE(review): the function body appears truncated here — the rest is
    # out of view.
    if (len(ids) > 128):
        ids = ids[0:128]
# NOTE(review): `ap` (the ArgumentParser), `pd`, `load_vocab`,
# `PinterestPretrainDataset`, and `MCBertForPretraining` are defined/imported
# outside this visible fragment.
ap.add_argument("-vp", "--vocab_path", help="Location of vocab for training.")
ap.add_argument("-sd", "--save_dir", help="Location to save the model.")
# to continue training models
ap.add_argument("-cp", "--continue_path", help="Path to model for warm start.")
# NOTE(review): "ward start" in the help text looks like a typo for
# "warm start" — runtime string left untouched in this doc-only pass.
ap.add_argument("-ce", "--continue_epoch", type=int, help="Epoch of model for ward start.")
args = vars(ap.parse_args())

metadata = pd.read_csv(args['metadata_path'])
# Vocabulary tokens only; the id mapping is not needed here.
vocab = list(load_vocab(args['vocab_path']).keys())
train_dataset = PinterestPretrainDataset(metadata, vocab, split='train')
val_dataset = PinterestPretrainDataset(metadata, vocab, split='val')

# Assemble the multimodal BERT pretraining model from CLI hyperparameters.
mcbert = MCBertForPretraining(vis_feat_dim=args['vis_feat_dim'],
                              spatial_size=args['spatial_size'],
                              hidden_dim=args['hidden_dim'],
                              cmb_feat_dim=args['cmb_feat_dim'],
                              kernel_size=args['kernel_size'],
                              batch_size=args['batch_size'],
                              learning_rate=args['learning_rate'],
                              warmup_proportion=args['warmup_proportion'],
                              num_epochs=args['num_epochs'])

# Warm start from a saved checkpoint when both path and epoch are supplied.
# NOTE(review): this call is truncated here — remaining arguments are out
# of view.
if args['continue_path'] and args['continue_epoch']:
    mcbert.load(args['continue_path'], args['continue_epoch'],
def test_WordpieceTokenizer():
    # Smoke test: load the cased BERT vocab and print the wordpiece split of
    # a known word next to a misspelled one (exercising [UNK]/subword paths).
    vocab_path = os.path.join(model_dir, "bert-base-cased-vocab.txt")
    tokenizer = WordpieceTokenizer(tokenization.load_vocab(vocab_path))
    print(tokenizer.tokenize("decomposition deoomposition"))