Example 1
    def __init__(self,
                 vocab_file,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_basic_tokenize=True
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)
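For context, a minimal usage sketch of the constructor above. The vocab path is a placeholder, and the sketch only exercises the attributes the shown body actually sets (vocab, ids_to_tokens, wordpiece_tokenizer, max_len):

# Hypothetical usage of the constructor above; "vocab.txt" is a placeholder path.
tokenizer = BertTokenizer("vocab.txt", max_len=128)

# vocab maps token -> id; ids_to_tokens is the inverse mapping built above.
unk_id = tokenizer.vocab["[UNK]"]
assert tokenizer.ids_to_tokens[unk_id] == "[UNK]"

# wordpiece_tokenizer splits a single (already basic-tokenized) token into pieces.
pieces = tokenizer.wordpiece_tokenizer.tokenize("tokenization")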
Example 2
 def from_config(cls, config: Config):
     basic_tokenizer = create_component(
         ComponentType.TOKENIZER, config.basic_tokenizer
     )
     vocab = load_vocab(config.wordpiece_vocab_path)
     wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
     return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
Example 3
 def __init__(self, min_char: int, vocab_file: str, lower: bool,
              add_sentence_boundary: bool, add_word_boundary: bool,
              use_cuda: bool):
     super(WordPieceBatch,
           self).__init__(min_char=min_char,
                          lower=lower,
                          add_sentence_boundary=add_sentence_boundary,
                          add_word_boundary=add_word_boundary,
                          use_cuda=use_cuda)
     self.vocab = load_vocab(vocab_file=vocab_file)
     self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
Example 4
    def __init__(self, vocab_file, do_lower_case=False, max_len=None):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'."
                "To load the vocabulary from a Google pretrained "
                "model use "
                "`tokenizer = "
                "BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    vocab_file))

        self.vocab = tokenization.load_vocab(vocab_file)
        self.ids_to_tokens = OrderedDict([(ids, tok)
                                          for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)
Example 5
    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(vocab)

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
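To make the docstring's Hello World example concrete, here is a stand-alone sketch of the same two-stage split (word tokenization first, then wordpieces); the vocab path is a placeholder and the exact pieces depend on the vocabulary file:

# Stand-alone illustration of the method above; "vocab.txt" is a placeholder path.
vocab = load_vocab("vocab.txt")
subword_tokenizer = WordpieceTokenizer(vocab)

tokens = []
for word in "Hello World".split():      # ['Hello', 'World']
    tokens.extend(subword_tokenizer.tokenize(word))
# With a typical BERT vocab this comes out close to ['He', '##llo', 'Wo', '##rld'];
# a word that cannot be segmented from the vocab is returned as '[UNK]'.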
Example 6
 def __init__(self,
              vocab_file,
              do_lower_case=True,
              max_len=None,
              never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"],
              add_special_tokens=[]):
     # merge the extra special tokens into never_split so they are not split during tokenization
     never_split = never_split + add_special_tokens
     if not os.path.isfile(vocab_file):
         raise ValueError(
             "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
             "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             .format(vocab_file))
     self.vocab = load_vocab(vocab_file,
                             add_special_tokens=add_special_tokens)
     self.ids_to_tokens = collections.OrderedDict([
         (ids, tok) for tok, ids in self.vocab.items()
     ])
     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                           never_split=never_split)
     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
     self.max_len = max_len if max_len is not None else int(1e12)
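A hedged usage sketch of the variant above: the class is assumed to be named BertTokenizer (as its own error message suggests), "vocab.txt" and "[NEWTOK]" are placeholders, and it is assumed that this project's load_vocab adds the extra tokens to the vocabulary when add_special_tokens is passed:

# Hypothetical usage; class name, vocab path and the "[NEWTOK]" token are illustrative only.
tokenizer = BertTokenizer("vocab.txt", add_special_tokens=["[NEWTOK]"])

# "[NEWTOK]" was merged into never_split, so the basic tokenizer keeps it intact,
# and (assuming load_vocab registered it) it maps to its own id as well.
newtok_id = tokenizer.vocab["[NEWTOK]"]
assert tokenizer.ids_to_tokens[newtok_id] == "[NEWTOK]"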
Example 7
    def __init__(self, drop_rate=0, gpu=True):
        super(TowardModel, self).__init__()

        self.UNK_IDX = 1
        self.PAD_IDX = 2
        self.START_IDX = 3
        self.EOS_IDX = 4
        self.MAX_SENT_LEN = 30
        self.gpu = gpu

        self.n_vocab = 6222
        self.emb_dim = 768

        self.vocab = load_vocab(
            '/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt')
        self.pos2token = {}
        for k, v in self.vocab.items():
            self.pos2token[v] = k

        self.word_dim = self.emb_dim  # 768
        self.word_emb = nn.Embedding(self.n_vocab, self.word_dim, self.PAD_IDX)
        """
        Discriminator(classifier)
        """
        self.channel_out = 100
        self.conv2d_2 = nn.Conv2d(1, self.channel_out, (2, self.emb_dim))
        self.conv2d_3 = nn.Conv2d(1, self.channel_out, (3, self.emb_dim))
        self.conv2d_4 = nn.Conv2d(1, self.channel_out, (4, self.emb_dim))
        self.conv2d_5 = nn.Conv2d(1, self.channel_out, (5, self.emb_dim))
        self.fc_drop = nn.Dropout(drop_rate)
        self.disc_fc = nn.Linear(4 * self.channel_out, 2)

        ## parameters
        # self.matrix_A.parameters()
        self.cls_params = (list(self.conv2d_2.parameters()) +
                           list(self.conv2d_3.parameters()) +
                           list(self.conv2d_4.parameters()) +
                           list(self.conv2d_5.parameters()) +
                           list(self.disc_fc.parameters()) +
                           list(self.word_emb.parameters()))
Example 8
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig, BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

## **** load label description
import biLSTM.encoder.data_loader as biLSTM_data_loader
import biLSTM.encoder.encoder_model as biLSTM_encoder_model
import biLSTM.encoder.entailment_model as biLSTM_entailment_model
import biLSTM.encoder.bi_lstm_model as bi_lstm_model

MAX_SEQ_LEN_LABEL_DEF = 512 ## max len for GO def (probably can be smaller)

if args.w2v_emb is not None: ## we can just treat each node as a vector without word description 
  Vocab = load_vocab(args.vocab_list) # all words found in pubmed and trained in w2v ... should trim down

## reading in feature labels is handled in the @GCN folder; too lazy to port this function out.
LabelDescLoader = GCN_data_loader.LabelProcessor()

if args.tree:
  # @label_in_ontology to get GO in the whole ontology, will be needed if we use tree method
  LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_in_ontology)
  LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_in_ontology, tokenize_style='space')

else:
  ## only get vectors for labels we want.
  LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_to_test)
  LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_to_test, tokenize_style='space')

Example 9
from itertools import chain
from tqdm import tqdm

import sys
sys.path.insert(0, "/DATA/joosung/pytorch_pretrained_BERT_master")
from pytorch_pretrained_bert.tokenization import load_vocab
from pytorch_bert_embedding import *
import torch  # used below for torch.device and torch.load
import torch.optim as optim

bert_model, bert_tokenizer = bert_model_load('bert-base-multilingual-cased')

n_iter = 50
vocab_size = 6222
mb_size = 1

vocab = load_vocab(
    '/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gpu_use = True


def main():
    from model import TowardModel
    model = TowardModel(gpu=gpu_use)
    model_name = 'simple_model_50000'
    model.load_state_dict(torch.load('models/{}'.format(model_name)))

    model = model.to(device)
    model.eval()

    f = open("../../sentiment_data/nsmc-master/ratings_train.txt", 'r')
    lines = f.readlines()
Example 10
import torch
import argparse
from tqdm import tqdm, trange
import os
import re

# BertTokenizer, BertForMaskedLM and load_vocab used below are assumed to come from pytorch_pretrained_bert, as in the other examples in this listing.
from pytorch_pretrained_bert.tokenization import BertTokenizer, load_vocab
from pytorch_pretrained_bert.modeling import BertForMaskedLM

base_path = os.path.dirname(os.path.abspath(__file__))

tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path),
                          do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('checkpoint/')
model.to(device)
model.eval()

vocab = load_vocab('{}/data/vocab.txt'.format(base_path))
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
    result = []
    for i, token in enumerate(tokens):
        # tokens preprocessing
        if i != 0 and i != tokens_length - 1:
            tokens[i] = '[MASK]'

        ids = tokenizer.convert_tokens_to_ids(tokens)
Example 11
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os

# BertTokenizer, BertForMaskedLM and load_vocab used below are assumed to come from pytorch_pretrained_bert, as above.
from pytorch_pretrained_bert.tokenization import BertTokenizer, load_vocab
from pytorch_pretrained_bert.modeling import BertForMaskedLM

base_path = os.path.dirname(os.path.abspath(__file__))

tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path),
                          do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()

vocab = load_vocab(vocab_file='{}/data/vocab.txt'.format(base_path))
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)

    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")

    tokens_length = len(tokens)

    ids = tokenizer.convert_tokens_to_ids(tokens)

    if (len(ids) > 128):
        ids = ids[0:128]
Example 12
    ap.add_argument("-vp",
                    "--vocab_path",
                    help="Location of vocab for training.")
    ap.add_argument("-sd", "--save_dir", help="Location to save the model.")
    # to continue training models
    ap.add_argument("-cp",
                    "--continue_path",
                    help="Path to model for warm start.")
    ap.add_argument("-ce",
                    "--continue_epoch",
                    type=int,
                    help="Epoch of model for ward start.")
    args = vars(ap.parse_args())

    metadata = pd.read_csv(args['metadata_path'])
    vocab = list(load_vocab(args['vocab_path']).keys())
    train_dataset = PinterestPretrainDataset(metadata, vocab, split='train')
    val_dataset = PinterestPretrainDataset(metadata, vocab, split='val')

    mcbert = MCBertForPretraining(vis_feat_dim=args['vis_feat_dim'],
                                  spatial_size=args['spatial_size'],
                                  hidden_dim=args['hidden_dim'],
                                  cmb_feat_dim=args['cmb_feat_dim'],
                                  kernel_size=args['kernel_size'],
                                  batch_size=args['batch_size'],
                                  learning_rate=args['learning_rate'],
                                  warmup_proportion=args['warmup_proportion'],
                                  num_epochs=args['num_epochs'])

    if args['continue_path'] and args['continue_epoch']:
        mcbert.load(args['continue_path'], args['continue_epoch'],
Example 13
def test_WordpieceTokenizer():
    model = WordpieceTokenizer(
        tokenization.load_vocab(
            os.path.join(model_dir, "bert-base-cased-vocab.txt")))
    print(model.tokenize("decomposition deoomposition"))
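As a small follow-up, the vocab returned by load_vocab is an OrderedDict mapping token -> id, so the wordpieces printed above can be converted straight back to ids; this sketch keeps the same model_dir assumption as the test:

def test_WordpieceTokenizer_ids():
    vocab = tokenization.load_vocab(
        os.path.join(model_dir, "bert-base-cased-vocab.txt"))
    model = WordpieceTokenizer(vocab)
    pieces = model.tokenize("decomposition deoomposition")
    # Every piece emitted by WordpieceTokenizer is either a vocab entry or the
    # "[UNK]" token (itself in the vocab), so direct lookups are safe.
    print([vocab[piece] for piece in pieces])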