Example #1
 def get_tokenizer(self):
     if self.hparams.model_type == 'bert':
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     elif self.hparams.model_type == 'bert-cased':
         tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
     elif self.hparams.model_type == 'bert-large':
         tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
     elif self.hparams.model_type == 'distilbert':
         tokenizer = DistilBertTokenizer.from_pretrained(
             'distilbert-base-uncased')
     elif self.hparams.model_type == 'roberta':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
     elif self.hparams.model_type == 'roberta-large':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
     elif self.hparams.model_type == 'albert':
         tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     elif self.hparams.model_type == 'albert-xxlarge':
         tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
     elif self.hparams.model_type == 'electra':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-base-discriminator')
     elif self.hparams.model_type == 'electra-large':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-large-discriminator')
     else:
         raise ValueError(f"Unknown model_type: {self.hparams.model_type}")
     return tokenizer
Example #2
    def __init__(self,
                 document_store: BaseDocumentStore,
                 max_seq_len: int = 256,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 remove_sep_tok_from_untitled_passages: bool = True):
        """
        Initialize the Retriever, including the two encoder models, from a local or remote model checkpoint.
        The checkpoint format matches the Hugging Face Transformers model format.

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param max_seq_len: Maximum length of each sequence
        :param use_gpu: Whether to use a GPU, if available
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding
        :param remove_sep_tok_from_untitled_passages: If embed_title is ``True``, there are different strategies to deal with documents that don't have a title.
        If this param is ``True`` => Embed passage as single text, similar to embed_title = False (i.e. [CLS] passage_tok1 ... [SEP]).
        If this param is ``False`` => Embed passage as text pair with empty title (i.e. [CLS] [SEP] passage_tok1 ... [SEP])
        """

        super().__init__(document_store)
        self.document_store = document_store
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
            device_used = "GPU"
        else:
            self.device = torch.device("cpu")
            device_used = "CPU"

        self.embed_title = embed_title
        self.remove_sep_tok_from_untitled_passages = remove_sep_tok_from_untitled_passages

        # Load pretrained retrievers
        binary_dir = './converse/models/orconvqa/pipeline_checkpoint/checkpoint-45000/retriever'
        token_dir = './converse/models/orconvqa/retriever_checkpoint'

        self.query_encoder = AlbertForRetrieverOnlyPositivePassage.from_pretrained(
            binary_dir, force_download=True).to(self.device)
        self.query_tokenizer = AlbertTokenizer.from_pretrained(token_dir)

        self.passage_tokenizer = AlbertTokenizer.from_pretrained(token_dir)
        self.passage_encoder = AlbertForRetrieverOnlyPositivePassage.from_pretrained(
            binary_dir, force_download=True).to(self.device)
        logger.info(
            f"ORConvQARetriever initialised with {type(document_store).__name__} Document Store, torch using {device_used} and model found in location {binary_dir} and tokenizer in location {token_dir}. The batch_size is {batch_size} and the max_seq_len is {max_seq_len}."
        )
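A minimal sketch of the two encoding strategies described in the docstring above, assuming a stock ALBERT tokenizer as a stand-in for the retriever's checkpoint tokenizers:

from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained('albert-base-v2')
passage = 'Lisbon is the capital of Portugal.'

# Single text, as used for untitled passages when
# remove_sep_tok_from_untitled_passages is True: [CLS] passage ... [SEP]
single = tok(passage)

# Text pair with an empty title, as used when it is False:
# [CLS] [SEP] passage ... [SEP]
pair = tok('', passage)

print(tok.convert_ids_to_tokens(single['input_ids']))
print(tok.convert_ids_to_tokens(pair['input_ids']))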
Example #3
def main(args):
    global tokenizer
    if args.tokenizer == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif args.tokenizer == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
    elif args.tokenizer == "albertx":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xlarge-v2')
    tokenizer.add_tokens(["<POS>"])
    tokenizer.add_tokens(["</POS>"])
    tokenizer.add_tokens(["<NEG>"])
    tokenizer.add_tokens(["</NEG>"])
    data = load_file(args.input)
    with open(args.output, 'wb') as f:
        pickle.dump(data, f)
Example #4
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert"):
        tokenizer = AlbertTokenizer(
            vocab_file="/work/dcml0714/albert/albert_base/30k-clean.model")
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size; it does not include newly added tokens

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
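A quick check of the vocab-size caveat noted in the function above; the model name is an arbitrary example:

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
tok.add_special_tokens({'bos_token': '<start>'})
# vocab_size ignores the added token, len() includes it
print(tok.vocab_size, len(tok))  # e.g. 30522 30523
print(tok.convert_ids_to_tokens(range(len(tok)))[-1])  # '<start>'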
Example #5
    def test_load_weights(self):
        model = get_albert_for_comparison()
        model.load_weights('./.trained_models/albert_weights/ab.ckpt')

        df_test2 = pd.read_csv(
            "../fakenews/data_for_validation/STSbenchmark/data/sts-test-cleaned.csv"
        ).dropna()
        df_test_in = df_test2.iloc[100:110]
        print(df_test_in)
        model_name = 'albert-base-v2'
        max_seq_length = 128

        tokenizer = AlbertTokenizer.from_pretrained(model_name,
                                                    do_lower_case=True,
                                                    add_special_tokens=True,
                                                    max_length=max_seq_length,
                                                    pad_to_max_length=True)
        pred_dataset = STSBenchmarkDataset(
            tokenizer, max_seq_length).from_dataframe(df_test_in,
                                                      training=False)
        print(pred_dataset)
        result = model.predict(pred_dataset.batch(1)) * 5.0
        print(result)
        self.assertAlmostEqual(result[0], 0.058892, delta=0.00001)
        self.assertAlmostEqual(result[2], 4.0518007, delta=0.00001)
        self.assertAlmostEqual(result[-1], 3.647635, delta=0.00001)
        print(result)
Example #6
 def load_model(self, model_dir: str, model_config: str = "model_config.json"):
     model_config = os.path.join(model_dir,model_config)
     model_config = json.load(open(model_config))
     model = BertNer.from_pretrained(model_dir)
     #tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=model_config["do_lower"])
     tokenizer = AlbertTokenizer.from_pretrained(model_dir, do_lower_case=model_config["do_lower"])
     return model, tokenizer, model_config
Example #7
    def __init__(self,
                 config,
                 device,
                 random_state_path=".randomstate/random_state.pkl"):
        mkdir("results")
        mkdir("saved")
        mkdir(".randomstate")
        self._RANDOM_STATE_PATH = random_state_path
        if not os.path.isfile(self._RANDOM_STATE_PATH):
            state = random.getstate()
            with open(self._RANDOM_STATE_PATH, "wb") as f:
                pickle.dump(state, f)

        self.config = config
        self.device = device
        self.n_iter = 0
        if config["tokenizer_type"].startswith("bert"):
            self.tokenizer = BertTokenizer.from_pretrained(
                config["tokenizer_type"], cache_dir=config["cache_dir"])

        if config["tokenizer_type"].startswith("roberta"):
            self.tokenizer = RobertaTokenizer.from_pretrained(
                config["tokenizer_type"], cache_dir=config["cache_dir"])

        if config["tokenizer_type"].startswith("albert"):
            self.tokenizer = AlbertTokenizer.from_pretrained(
                config["tokenizer_type"], cache_dir=config["cache_dir"])

        if config["tensorboard_logging"]:
            from torch.utils.tensorboard import SummaryWriter
            self.boardwriter = SummaryWriter()
Example #8
    def initialize(self, context):
        """
        Invoked by TorchServe to load the model
        :param context: context contains model server system properties
        :return:
        """
        self.tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        properties = context.system_properties
        self.map_location = 'cpu'
        self.device = torch.device("cpu")
        self.manifest = context.manifest

        model_dir = properties.get("model_dir")
        serialized_file = self.manifest['model']['serializedFile']
        model_pt_path = os.path.join(model_dir, serialized_file)

        if not os.path.isfile(model_pt_path):
            raise RuntimeError("Missing the model.pt file")

        # model def file
        model_file = self.manifest['model'].get('modelFile', '')

        if model_file:
            self.model = self._load_pickled_model(model_dir, model_file, model_pt_path)
        else:
            self.model = self._load_torchscript_model(model_pt_path)

        self.model.to(self.device)
        self.model.eval()
        # Load class mapping for classifiers
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        self.initialized = True
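TorchServe builds the context itself; for a local smoke test, a stub exposing the same two attributes is enough. The handler class name and paths below are hypothetical, and the serialized model file must actually exist under model_dir:

class _StubContext:
    # Stand-in for TorchServe's context: only the attributes used above.
    system_properties = {'model_dir': '/tmp/model_store'}  # hypothetical path
    manifest = {'model': {'serializedFile': 'model.pt', 'modelFile': ''}}

handler = AlbertHandler()  # hypothetical name of the handler class above
handler.initialize(_StubContext())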
Example #9
    def __init__(self, model='bert'):

        if model.lower() == 'bert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.max_input_length = self.tokenizer.max_model_input_sizes[
                'bert-base-uncased']
        elif model.lower() == 'albert':
            self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            self.max_input_length = self.tokenizer.max_model_input_sizes[
                'albert-base-v2']
        elif model.lower() == 't5':
            pass

        # these fields are required for dataset.Field and for any masked model

        self.init_token = self.tokenizer.cls_token  # classification token
        self.eos_token = self.tokenizer.sep_token  # separation token
        self.pad_token = self.tokenizer.pad_token  # padding token
        self.unk_token = self.tokenizer.unk_token  # unknown token

        # indices for the above tokens

        self.init_token_idx = self.tokenizer.cls_token_id
        self.eos_token_idx = self.tokenizer.sep_token_id
        self.pad_token_idx = self.tokenizer.pad_token_id
        self.unk_token_idx = self.tokenizer.unk_token_id
Example #10
    def __init__(self):
        ''' PRE-LOAD NECESSARY DATA '''
        # print(os.path.join('models', 'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
        # print(os.path.join('models', 'albert_t'))
        # print(os.path.join('models', 'albert_m'))
        # print(os.getcwd())
        self.__sentence_model = SentenceTransformer(
            os.path.join(
                'models',
                'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
        self.__tokenizer = AlbertTokenizer.from_pretrained(
            os.path.join('models', 'albert_t'))
        self.__model = AlbertForQuestionAnswering.from_pretrained(
            os.path.join('models', 'albert_m'))

        # Read url file
        with open(os.path.join('data', 'urls.txt'), 'r') as file:
            self.urls = file.read().splitlines()
            file.close()
        with open(os.path.join('data', 'titles.txt'), 'r') as file:
            self.titles = file.read().splitlines()
            file.close()

        # Load pickle files into variables
        names = [
            os.path.join('data', 'punctuated.pkl'),
            os.path.join('data', 'punctuated_embed.pkl'),
            os.path.join('data', 'subs.pkl')
        ]
        self.__punctuateds, self.__sentence_embeddings_p, self.__subs = tuple(
            map(loadPickle, names))
        ''' END OF PRE-LOAD NECESSARY DATA '''
Example #11
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
Example #12
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Intiailize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, albert, or distilbert)
    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    return tokenizer
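A usage sketch illustrating the lazy global initialization: whichever lm is requested first wins, and later calls return the cached instance:

tokenizer = None  # module-level cache assumed by get_tokenizer

t1 = get_tokenizer(lm='albert')
t2 = get_tokenizer(lm='bert')  # still the ALBERT tokenizer: cache already populated
assert t1 is t2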
Example #13
def pick_tokenizer(model_name='albert-base-v2'):
    """
    Return the specified tokenizer.
    Available model names:
        ['albert-base-v2', 'bert-base-uncased', 'bert-large-uncased',
         'roberta-base', 'roberta-large', 'roberta-large-mnli',
         'xlnet-base-cased']
    """
    if model_name == 'albert-base-v2':
        tokenizer = AlbertTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name in ('bert-base-uncased', 'bert-large-uncased'):
        tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name in ('roberta-base', 'roberta-large', 'roberta-large-mnli'):
        tokenizer = RobertaTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name == 'xlnet-base-cased':
        tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    else:
        raise ValueError(f'Unknown model_name: {model_name}')

    print(f'Loaded {model_name} tokenizer.')
    return tokenizer
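A usage sketch:

tok = pick_tokenizer('bert-base-uncased')  # prints 'Loaded bert-base-uncased tokenizer.'
print(tok.tokenize('Hello world!'))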
Example #14
def create_tokenizer(model_type: str) -> PreTrainedTokenizer:
    if model_type == "albert":
        return AlbertTokenizer.from_pretrained("albert-base-v2")
    elif model_type == "bert":
        return BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError(f"model_type={model_type} must be one of ['albert', 'bert']")
Example #15
 def __init__(self, albert_config):
     self.albert_config = albert_config
     model_name = self.albert_config['model_name']
     self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
     self.model = TFAlbertModel.from_pretrained(model_name)
     self.summary_extraction_mode = self.albert_config[
         'summary_extraction_mode']
Example #16
def get_tokenizer(model):
    if model == "bert-base-uncased":
        return BertTokenizer.from_pretrained(model)
    elif model == "xlnet-base-cased":
        return XLNetTokenizer.from_pretrained(model)
    elif model == "albert-base-v2":
        return AlbertTokenizer.from_pretrained(model)
Example #17
def create_data_loaders(data_loader_cfg,
                        tokenizer_cfg,
                        dataset_paths,
                        is_train=True):
    '''
    dataset_paths: can be either a single path to a dataset or a dictionary of paths
    where the keys specify the split, e.g.
        {
            'train': './data/train.pkl',
            'dev': './data/dev.pkl'
        }.
    '''
    tokenizer = AlbertTokenizer.from_pretrained(**tokenizer_cfg)

    print('[*] Loading datasets')
    datasets = load_datasets(dataset_paths,
                             tokenizer.max_len,
                             is_train=is_train)

    print('\n[*] Creating data loaders')
    if type(datasets) is dict:
        data_loaders = {
            k: create_data_loader(dataset, tokenizer, **data_loader_cfg)
            for k, dataset in datasets.items()
        }
    else:
        data_loaders = create_data_loader(datasets, tokenizer,
                                          **data_loader_cfg)

    return data_loaders
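A usage sketch with a dictionary of paths; the config values and pickle paths are illustrative, and load_datasets/create_data_loader are assumed to come from the same module:

tokenizer_cfg = {'pretrained_model_name_or_path': 'albert-base-v2'}
data_loader_cfg = {'batch_size': 32}
dataset_paths = {
    'train': './data/train.pkl',
    'dev': './data/dev.pkl',
}
loaders = create_data_loaders(data_loader_cfg, tokenizer_cfg, dataset_paths)
train_loader, dev_loader = loaders['train'], loaders['dev']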
Example #18
    def __init__(
            self,
            lang: str = 'en',
            ):
        try:
            from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer, AutoTokenizer
            from transformers import AlbertModel, BertModel, CamembertModel, AutoModel
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
            model = AlbertModel.from_pretrained("albert-base-v2")
        elif lang == "fr":
            tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
            model = CamembertModel.from_pretrained("camembert-base")
        elif lang == "es":
            tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
            model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
        elif lang == "ja":
            tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
            model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

        self.tokenizer = tokenizer
        self.model = model
Example #19
    def __init__(self, args, num_ner_labels):
        super().__init__()

        bert_model_name = args.model
        vocab_name = bert_model_name

        if args.bert_model_dir is not None:
            bert_model_name = str(args.bert_model_dir) + '/'
            # vocab_name = bert_model_name + 'vocab.txt'
            vocab_name = bert_model_name
            logger.info('Loading BERT model from {}'.format(bert_model_name))

        if args.use_albert:
            self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
            self.bert_model = AlbertForEntity.from_pretrained(
                bert_model_name,
                num_ner_labels=num_ner_labels,
                max_span_length=args.max_span_length)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
            self.bert_model = BertForEntity.from_pretrained(
                bert_model_name,
                num_ner_labels=num_ner_labels,
                max_span_length=args.max_span_length)

        self._model_device = 'cpu'
        self.move_model_to_cuda()
Example #20
    def __init__(self, args, device='cpu'):
        # self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        self.tokenizer = AlbertTokenizer.from_pretrained(args.bert_model)
        self.data_dir = args.data_dir
        file_list = get_json_file_list(args.data_dir)
        self.data = []
        #max_article_len = 0
        for file_name in file_list:
            data = json.loads(open(file_name, 'r').read())
            data['high'] = 0
            if ('high' in file_name):
                data['high'] = 1
            self.data.append(data)

        self.data_objs = []
        high_cnt = 0
        middle_cnt = 0
        for sample in self.data:
            high_cnt += sample['high']
            middle_cnt += (1 - sample['high'])
            self.data_objs += self._create_sample(sample)

        print('high school sample:', high_cnt)
        print('middle school sample:', middle_cnt)
        for i in range(len(self.data_objs)):
            self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
            #break
        torch.save(self.data_objs, args.save_name)
Example #21
 def __init__(self):
     super().__init__()
     self.bert = AlbertModel.from_pretrained('albert-base-v2')
     self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     self.score_fc = nn.Linear(768, 11)
     self.regression_fc = nn.Linear(768, 1)
     self.sigmoid = nn.Sigmoid()
Example #22
def load_and_predict(data_dir, model_type, pretrain_model):
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')

    if model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)

    if model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))

    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  #prediction dictionary
Example #23
def build(args):
    TAG = create_tags()
    XLSX_PATH = {'train': 'release/train/ca_data', 'dev': 'release/dev/ca_data', 'test': 'release/test/ca_data'}
    
    PRETRAINED_MODEL_NAME = 'ALINEAR/albert-japanese-v2'
    tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    
    train_data = TrainData(XLSX_PATH['train'], TAG, only_positive=args.only_positive)

    trainset = QADataset(train_data.examples, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=args.batch_size, collate_fn=collate_fn)
    
    dev_data = TrainData(XLSX_PATH['dev'], TAG, only_positive=args.only_positive)
    
    devset = QADataset(dev_data.examples, "train", tokenizer=tokenizer)
    devloader = DataLoader(devset, batch_size=args.batch_size, collate_fn=collate_fn)

    logger.info(f"[train data] {train_data.summary()}")
    logger.info(f"[dev data] {dev_data.summary()}")
    
    test_data = TestData(XLSX_PATH['dev'], TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=args.batch_size, collate_fn=collate_fn)
    
    model = AlbertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)
    model = model.to(args.device)

    if args.load_pretrained_model:
        model.load_state_dict(torch.load(args.pretrained_model_path))
    
    return model, trainloader, devloader, testloader, tokenizer
Example #24
    def __init__(
        self,
        lang: str = "en",
    ):
        try:
            from transformers import (AlbertModel, AlbertTokenizer, BertConfig,
                                      BertJapaneseTokenizer, BertModel,
                                      CamembertModel, CamembertTokenizer)
        except ImportError:
            msg = "importing bert dep failed."
            msg += "\n try to install sister by `pip install sister[bert]`."
            raise ImportError(msg)

        if lang == "en":
            model_name = "albert-base-v2"
            tokenizer = AlbertTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = AlbertModel.from_pretrained(model_name, config=config)
        elif lang == "fr":
            model_name = "camembert-base"
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name, config=config)
        elif lang == "ja":
            model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
            tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
            config = BertConfig.from_pretrained(model_name,
                                                output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config)

        self.tokenizer = tokenizer
        self.model = model
Example #25
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(model_type + " was invalid")

    return model, tokenizer
Example #26
def load_test_data(file_path, batch_size=32, tsv=False):

    df = pd.read_csv(file_path)
    sentences = df.h.values
    raw_labels = df.label.values

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    encoding = tokenizer(list(sentences),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)

    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']
    labels = torch.tensor(raw_labels).unsqueeze(1).to(device)
    test_dataset = TensorDataset(input_ids, attention_masks, labels)

    # Dataloading
    prediction_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=prediction_sampler,
                                 batch_size=batch_size)

    return (test_dataloader, sentences, raw_labels)
Example #27
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa', use_counter=True):
    #w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    #q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    w_emb = AlbertTokenizer.from_pretrained('albert-large-v2')
    q_emb = AlbertModel.from_pretrained('albert-large-v2')
    params_set = set()
    for param in q_emb.parameters():
        params_set.add(param)
        param.requires_grad = False
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects) if use_counter else None
        return BanModel(dataset, params_set, w_emb, q_emb, v_att, b_net, q_prj,
                        c_prj, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
Example #28
    def load_tokenizer(cls, transformer: str):
        """
        Loads the tokenizer based on the given transformer name.

        Args:
            transformer: Name of huggingface transformer
        """
        tokenizer_path = "data/{0}_tokenizer.pkl".format(transformer)
        if os.path.isfile(tokenizer_path):
            logger.info("Loading tokenizer from saved path.")
            with open(tokenizer_path, "rb") as pkl_file:
                return joblib.load(pkl_file)
        elif "albert" in transformer:
            tokenizer = AlbertTokenizer.from_pretrained(transformer,
                                                        do_lower_case=False)
        else:
            tokenizer = BertTokenizer.from_pretrained(transformer,
                                                      do_lower_case=False,
                                                      add_special_tokens=True)
        tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])
        with open(tokenizer_path, "wb") as output:
            joblib.dump(tokenizer, output)

        logger.info("Saved {0} tokenizer at {1}".format(
            transformer, tokenizer_path))
        return tokenizer
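A usage sketch of the pickle-backed caching; the owning class name is hypothetical:

# First call downloads the tokenizer, adds the entity markers and pickles it
tok = RelationModel.load_tokenizer('albert-base-v2')
# Later calls load data/albert-base-v2_tokenizer.pkl instead
tok2 = RelationModel.load_tokenizer('albert-base-v2')
print(tok.convert_tokens_to_ids('[E1]'))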
Example #29
    def _albert(self):
        from transformers import AlbertTokenizer  # noqa pylint: disable=import-outside-toplevel

        self.lm_model_tokenizer = AlbertTokenizer.from_pretrained(
            'albert-base-v2')
        self.lm_padding_value = self.lm_model_tokenizer._convert_token_to_id(
            '<pad>')
        space_value = self.lm_model_tokenizer._convert_token_to_id('▁')

        self.id2lm_tokens = {}
        for i, d in enumerate(self.data):
            normalized_text = d["normalized_text"]

            assert isinstance(self.text_tokenizer,
                              EnglishPhonemesTokenizer) or isinstance(
                                  self.text_tokenizer, EnglishCharsTokenizer)
            preprocess_text_as_tts_input = self.text_tokenizer.text_preprocessing_func(
                normalized_text)

            lm_tokens_as_ids = self.lm_model_tokenizer.encode(
                preprocess_text_as_tts_input, add_special_tokens=False)

            if self.text_tokenizer.pad_with_space:
                lm_tokens_as_ids = [space_value
                                    ] + lm_tokens_as_ids + [space_value]

            self.id2lm_tokens[i] = lm_tokens_as_ids
Example #30
def load_data(file_path, oversample=False):
    df = pd.read_csv(file_path)

    sentences = df.sentences.values
    labels = df.labels.values

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    idxs = []
    for i in range(len(sentences)):
        if pd.isnull(sentences[i]):
            idxs.append(i)
    sentences = np.delete(sentences, idxs)
    labels = np.delete(labels, idxs)

    encoding = tokenizer(list(sentences),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)

    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']
    labels = torch.tensor(labels).unsqueeze(1).to(device)

    return input_ids, attention_masks, labels