def get_tokenizer(self):
    """Return the pretrained tokenizer matching ``self.hparams.model_type``.

    Returns:
        A transformers tokenizer instance for the configured model type.

    Raises:
        ValueError: if ``self.hparams.model_type`` is not a supported type.
    """
    model_type = self.hparams.model_type
    if model_type == 'bert':
        return BertTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'bert-cased':
        return BertTokenizer.from_pretrained('bert-base-cased')
    elif model_type == 'bert-large':
        return BertTokenizer.from_pretrained('bert-large-uncased')
    elif model_type == 'distilbert':
        return DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    elif model_type == 'roberta':
        return RobertaTokenizer.from_pretrained('roberta-base')
    elif model_type == 'roberta-large':
        return RobertaTokenizer.from_pretrained('roberta-large')
    elif model_type == 'albert':
        return AlbertTokenizer.from_pretrained('albert-base-v2')
    elif model_type == 'albert-xxlarge':
        return AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    elif model_type == 'electra':
        return ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
    elif model_type == 'electra-large':
        return ElectraTokenizer.from_pretrained('google/electra-large-discriminator')
    # FIX: the original raised a bare ValueError with no message; include the
    # offending value so misconfiguration is diagnosable from the traceback.
    raise ValueError(f"Unsupported model_type: {model_type!r}")
def __init__(self,
             document_store: BaseDocumentStore,
             max_seq_len: int = 256,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             remove_sep_tok_from_untitled_passages: bool = True):
    """
    Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
    The checkpoint format matches huggingface transformers' model format.

    :param document_store: An instance of DocumentStore from which to retrieve documents.
    :param max_seq_len: Longest length of each sequence
    :param use_gpu: Whether to use gpu or not
    :param batch_size: Number of questions or passages to encode at once
    :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding
    :param remove_sep_tok_from_untitled_passages: If embed_title is ``True``, there are different strategies to deal with
        documents that don't have a title.
        If this param is ``True`` => Embed passage as single text, similar to embed_title = False
        (i.e [CLS] passage_tok1 ... [SEP]).
        If this param is ``False`` => Embed passage as text pair with empty title
        (i.e. [CLS] [SEP] passage_tok1 ... [SEP])
    """
    super().__init__(document_store)
    self.document_store = document_store
    self.batch_size = batch_size
    self.max_seq_len = max_seq_len
    # Fall back to CPU when CUDA is unavailable, even if use_gpu was requested.
    if use_gpu and torch.cuda.is_available():
        self.device = torch.device("cuda")
        device_used = "GPU"
    else:
        self.device = torch.device("cpu")
        device_used = "CPU"
    self.embed_title = embed_title
    self.remove_sep_tok_from_untitled_passages = remove_sep_tok_from_untitled_passages
    # Load pretrained retrievers: query and passage encoders share the same
    # checkpoint directory, and both tokenizers come from the same tokenizer
    # directory (hard-coded relative paths).
    binary_dir = './converse/models/orconvqa/pipeline_checkpoint/checkpoint-45000/retriever'
    token_dir = './converse/models/orconvqa/retriever_checkpoint'
    self.query_encoder = AlbertForRetrieverOnlyPositivePassage.from_pretrained(
        binary_dir, force_download=True).to(self.device)
    self.query_tokenizer = AlbertTokenizer.from_pretrained(token_dir)
    self.passage_tokenizer = AlbertTokenizer.from_pretrained(token_dir)
    self.passage_encoder = AlbertForRetrieverOnlyPositivePassage.from_pretrained(
        binary_dir, force_download=True).to(self.device)
    logger.info(
        f"ORConvQARetriever initialised with {type(document_store).__name__} Document Store, torch using {device_used} and model found in location {binary_dir} and tokenizer in location {token_dir}. The batch_size is {batch_size} and the max_seq_len is {max_seq_len}."
    )
def main(args):
    """Build the tokenizer named by ``args.tokenizer``, register the POS/NEG
    marker tokens, then load ``args.input`` and pickle it to ``args.output``.

    :param args: namespace with ``tokenizer``, ``input`` and ``output`` attributes
    :raises ValueError: if ``args.tokenizer`` is not 'bert', 'albert' or 'albertx'
    """
    global tokenizer
    if args.tokenizer == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif args.tokenizer == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
    elif args.tokenizer == "albertx":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xlarge-v2')
    else:
        # FIX: an unrecognized name previously fell through and crashed below
        # with a NameError on `tokenizer`; fail fast with a clear message.
        raise ValueError(f"Unknown tokenizer: {args.tokenizer!r}")
    # Sentiment-span marker tokens used by downstream processing.
    tokenizer.add_tokens(["<POS>"])
    tokenizer.add_tokens(["</POS>"])
    tokenizer.add_tokens(["<NEG>"])
    tokenizer.add_tokens(["</NEG>"])
    data = load_file(args.input)
    with open(args.output, 'wb') as f:
        pickle.dump(data, f)
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert"):
        # NOTE(review): hard-coded local sentencepiece model — the actual
        # tokenizer_name is ignored for ALBERT; confirm this is intentional.
        tokenizer = AlbertTokenizer(
            vocab_file="/work/dcml0714/albert/albert_base/30k-clean.model")
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    # GPT-style models ship without CLS/SEP-style tokens; register replacements.
    # BUG FIX: the original checked "transo-xl-" (typo), so TransfoXL models
    # never received these special tokens; the prefix now matches the branch above.
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>
    # do not use tokenizer.vocab_size: it does not include newly added tokens
    vocab_size = len(tokenizer)
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word,
                                     input_module_tokenizer_name(tokenizer_name))
def test_load_weights(self):
    """Regression test: restore saved ALBERT weights and check STS-benchmark
    predictions on a fixed data slice against previously recorded values."""
    model = get_albert_for_comparison()
    model.load_weights('./.trained_models/albert_weights/ab.ckpt')
    # Fixed evaluation slice (rows 100-109) of the cleaned STS test split.
    df_test2 = pd.read_csv(
        "../fakenews/data_for_validation/STSbenchmark/data/sts-test-cleaned.csv"
    ).dropna()
    df_test_in = df_test2.iloc[100:110]
    print(df_test_in)
    model_name = 'albert-base-v2'
    max_seq_length = 128
    tokenizer = AlbertTokenizer.from_pretrained(model_name,
                                                do_lower_case=True,
                                                add_special_tokens=True,
                                                max_length=max_seq_length,
                                                pad_to_max_length=True)
    pred_dataset = STSBenchmarkDataset(
        tokenizer, max_seq_length).from_dataframe(df_test_in, training=False)
    print(pred_dataset)
    # Model outputs lie in [0, 1]; scale up to the 0-5 STS similarity range.
    result = model.predict(pred_dataset.batch(1)) * 5.0
    print(result)
    # Golden values recorded from a known-good run of this checkpoint.
    self.assertAlmostEqual(result[0], 0.058892, delta=0.00001)
    self.assertAlmostEqual(result[2], 4.0518007, delta=0.00001)
    self.assertAlmostEqual(result[-1], 3.647635, delta=0.00001)
    print(result)
def load_model(self, model_dir: str, model_config: str = "model_config.json"):
    """Load a fine-tuned NER model, its tokenizer and its config from ``model_dir``.

    :param model_dir: directory containing the pretrained model files
    :param model_config: config filename (JSON) inside ``model_dir``
    :return: tuple of (model, tokenizer, config dict)
    """
    config_path = os.path.join(model_dir, model_config)
    # FIX: the original `json.load(open(...))` leaked the file handle;
    # use a context manager so the file is always closed.
    with open(config_path) as config_file:
        model_config = json.load(config_file)
    model = BertNer.from_pretrained(model_dir)
    # NOTE(review): an ALBERT tokenizer is loaded for a model class named
    # BertNer — presumably the checkpoint is ALBERT-based; confirm.
    #tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=model_config["do_lower"])
    tokenizer = AlbertTokenizer.from_pretrained(
        model_dir, do_lower_case=model_config["do_lower"])
    return model, tokenizer, model_config
def __init__(self, config, device, random_state_path=".randomstate/random_state.pkl"):
    """Set up output directories, persist the RNG state for reproducibility,
    and load the tokenizer named by ``config["tokenizer_type"]``.

    :param config: dict with at least "tokenizer_type", "cache_dir" and
        "tensorboard_logging" keys
    :param device: torch device used by the caller
    :param random_state_path: where the pickled random.getstate() is stored
    """
    mkdir("results")
    mkdir("saved")
    mkdir(".randomstate")
    self._RANDOM_STATE_PATH = random_state_path
    # Persist the RNG state on first run so later runs can restore it.
    if not os.path.isfile(self._RANDOM_STATE_PATH):
        state = random.getstate()
        with open(self._RANDOM_STATE_PATH, "wb") as f:
            pickle.dump(state, f)
    self.config = config
    self.device = device
    self.n_iter = 0
    # NOTE(review): these are independent `if`s, not elif — an unrecognized
    # tokenizer_type silently leaves self.tokenizer unset. Exactly one branch
    # fires for the known prefixes ("roberta"/"albert" don't start with "bert").
    if config["tokenizer_type"].startswith("bert"):
        self.tokenizer = BertTokenizer.from_pretrained(
            config["tokenizer_type"], cache_dir=config["cache_dir"])
    if config["tokenizer_type"].startswith("roberta"):
        self.tokenizer = RobertaTokenizer.from_pretrained(
            config["tokenizer_type"], cache_dir=config["cache_dir"])
    if config["tokenizer_type"].startswith("albert"):
        self.tokenizer = AlbertTokenizer.from_pretrained(
            config["tokenizer_type"], cache_dir=config["cache_dir"])
    if config["tensorboard_logging"]:
        # Imported lazily so tensorboard is only required when logging is enabled.
        from torch.utils.tensorboard import SummaryWriter
        self.boardwriter = SummaryWriter()
def initialize(self, context):
    """
    Invoked by torchserve for loading a model
    :param context: context contains model server system properties
    :return:
    """
    self.tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    properties = context.system_properties
    # Inference is pinned to CPU regardless of available hardware.
    self.map_location = 'cpu'
    self.device = torch.device("cpu")
    self.manifest = context.manifest
    model_dir = properties.get("model_dir")
    serialized_file = self.manifest['model']['serializedFile']
    model_pt_path = os.path.join(model_dir, serialized_file)
    if not os.path.isfile(model_pt_path):
        raise RuntimeError("Missing the model.pt file")
    # model def file: load an eager model when a model file is declared in the
    # manifest, otherwise fall back to a TorchScript archive.
    model_file = self.manifest['model'].get('modelFile', '')
    if model_file:
        self.model = self._load_pickled_model(model_dir, model_file,
                                              model_pt_path)
    else:
        self.model = self._load_torchscript_model(model_pt_path)
    self.model.to(self.device)
    self.model.eval()
    # Load class mapping for classifiers
    # NOTE(review): mapping_file_path is computed but never read here —
    # presumably consumed elsewhere or a leftover; confirm.
    mapping_file_path = os.path.join(model_dir, "index_to_name.json")
    self.initialized = True
def __init__(self, model='bert'):
    """Wrap a pretrained tokenizer and expose its special tokens and their ids.

    :param model: one of 'bert' or 'albert' ('t5' is recognized but not implemented)
    :raises NotImplementedError: for 't5'
    :raises ValueError: for any other unknown model name
    """
    name = model.lower()
    if name == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_input_length = self.tokenizer.max_model_input_sizes[
            'bert-base-uncased']
    elif name == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.max_input_length = self.tokenizer.max_model_input_sizes[
            'albert-base-v2']
    elif name == 't5':
        # FIX: the original silently `pass`ed here and then crashed below with
        # an AttributeError on self.tokenizer; fail with a clear error instead.
        raise NotImplementedError("t5 tokenizer support is not implemented")
    else:
        raise ValueError(f"Unknown model: {model!r}")
    # these fields are required for dataset.Field and for any masked model
    self.init_token = self.tokenizer.cls_token  # classification token
    self.eos_token = self.tokenizer.sep_token   # separation token
    self.pad_token = self.tokenizer.pad_token   # padding token
    self.unk_token = self.tokenizer.unk_token   # unknown token
    # indices for the above tokens
    self.init_token_idx = self.tokenizer.cls_token_id
    self.eos_token_idx = self.tokenizer.sep_token_id
    self.pad_token_idx = self.tokenizer.pad_token_id
    self.unk_token_idx = self.tokenizer.unk_token_id
def __init__(self):
    """Pre-load the sentence encoder, the QA model/tokenizer, and the cached
    url/title/pickle data files used by the service."""
    self.__sentence_model = SentenceTransformer(
        os.path.join(
            'models', 'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
    self.__tokenizer = AlbertTokenizer.from_pretrained(
        os.path.join('models', 'albert_t'))
    self.__model = AlbertForQuestionAnswering.from_pretrained(
        os.path.join('models', 'albert_m'))
    # Read url/title lists. FIX: `with` already closes the files — the
    # explicit file.close() calls in the original were redundant and removed.
    with open(os.path.join('data', 'urls.txt'), 'r') as file:
        self.urls = file.read().splitlines()
    with open(os.path.join('data', 'titles.txt'), 'r') as file:
        self.titles = file.read().splitlines()
    # Load pickle files into variables
    names = [
        os.path.join('data', 'punctuated.pkl'),
        os.path.join('data', 'punctuated_embed.pkl'),
        os.path.join('data', 'subs.pkl')
    ]
    self.__punctuateds, self.__sentence_embeddings_p, self.__subs = tuple(
        map(loadPickle, names))
def get_tokenizer(tokenizer_name):
    """Instantiate and return the tokenizer selected by ``tokenizer_name``.

    Prefix-matches the huggingface model families; also supports a few plain
    tokenizers ("MosesTokenizer", "SplitChars", "" for whitespace splitting).
    Returns None when the name matches nothing.
    """
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    name = tokenizer_name
    if name.startswith("bert-"):
        return BertTokenizer.from_pretrained(
            name, do_lower_case=name.endswith("uncased"))
    if name.startswith("roberta-"):
        return RobertaTokenizer.from_pretrained(name)
    if name.startswith("albert-"):
        return AlbertTokenizer.from_pretrained(name)
    if name.startswith("xlnet-"):
        return XLNetTokenizer.from_pretrained(
            name, do_lower_case=name.endswith("uncased"))
    if name.startswith("openai-gpt"):
        return OpenAIGPTTokenizer.from_pretrained(name)
    if name.startswith("gpt2"):
        return GPT2Tokenizer.from_pretrained(name)
    if name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        return MosesTokenizer()
    if name.startswith("xlm-"):
        return XLMTokenizer.from_pretrained(name)
    if name == "MosesTokenizer":
        return MosesTokenizer()
    if name == "SplitChars":
        return SplitCharsTokenizer()
    if name == "":
        return SpaceTokenizer()
    return None
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Intiailize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, albert, or distilbert)

    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer
    """
    # Map each language-model name to (tokenizer class name, pretrained weights).
    specs = {
        'bert': ('BertTokenizer', 'bert-base-uncased'),
        'distilbert': ('DistilBertTokenizer', 'distilbert-base-uncased'),
        'albert': ('AlbertTokenizer', 'albert-base-v2'),
        'roberta': ('RobertaTokenizer', 'roberta-base'),
        'xlnet': ('XLNetTokenizer', 'xlnet-base-cased'),
        'longformer': ('LongformerTokenizer', 'allenai/longformer-base-4096'),
    }
    global tokenizer
    # Module-level singleton: build only on first use, for a known lm name.
    if tokenizer is None and lm in specs:
        # Imported lazily so transformers is only required when actually used.
        import transformers
        cls_name, weights = specs[lm]
        tokenizer = getattr(transformers, cls_name).from_pretrained(weights)
    return tokenizer
def pick_tokenizer(model_name='albert-base-v2'):
    """
    Return specified tokenizer:
    Available model names: ['albert-base-v2', 'bert-base-uncased',
    'bert-large-uncased', 'roberta-base', 'roberta-large',
    'roberta-large-mnli', 'xlnet-base-cased']

    :raises ValueError: for any other model name (the original fell through
        and crashed with an UnboundLocalError instead)
    """
    if model_name == 'albert-base-v2':
        tokenizer = AlbertTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name in ('bert-base-uncased', 'bert-large-uncased'):
        tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name in ('roberta-base', 'roberta-large', 'roberta-large-mnli'):
        tokenizer = RobertaTokenizer.from_pretrained(model_name, do_lower_case=True)
    elif model_name == 'xlnet-base-cased':
        tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    else:
        raise ValueError(f"Unknown model name: {model_name!r}")
    print(f'Loaded {model_name} tokenizer.')
    return tokenizer
def create_tokenizer(model_type: str) -> PreTrainedTokenizer:
    """Build the pretrained tokenizer for ``model_type``.

    :param model_type: either "albert" or "bert"
    :raises ValueError: for any other value
    """
    if model_type == "albert":
        return AlbertTokenizer.from_pretrained("albert-base-v2")
    if model_type == "bert":
        return BertTokenizer.from_pretrained("bert-base-uncased")
    raise ValueError(f"model_type={model_type} must be one of ['albert', 'bert']")
def __init__(self, albert_config):
    """Store the config and load the ALBERT tokenizer and TF model it names.

    :param albert_config: dict with 'model_name' and 'summary_extraction_mode'
    """
    self.albert_config = albert_config
    pretrained_name = albert_config['model_name']
    self.tokenizer = AlbertTokenizer.from_pretrained(pretrained_name)
    self.model = TFAlbertModel.from_pretrained(pretrained_name)
    self.summary_extraction_mode = albert_config['summary_extraction_mode']
def get_tokenizer(model):
    """Return the pretrained tokenizer matching ``model``.

    Returns None for names outside the supported set (mirrors the implicit
    fall-through of the original).
    """
    # Lazy factories: tokenizer classes are only touched for a matched name.
    factories = {
        "bert-base-uncased": lambda: BertTokenizer.from_pretrained(model),
        "xlnet-base-cased": lambda: XLNetTokenizer.from_pretrained(model),
        "albert-base-v2": lambda: AlbertTokenizer.from_pretrained(model),
    }
    factory = factories.get(model)
    return factory() if factory is not None else None
def create_data_loaders(data_loader_cfg, tokenizer_cfg, dataset_paths, is_train=True):
    """Create data loader(s) for the given dataset path(s).

    dataset_paths: can be either a single path to a dataset or a dictionary of
    paths where the keys specify the model, e.g.
    { 'train': './data/train.pkl', 'dev': './data/dev.pkl' }.
    """
    tokenizer = AlbertTokenizer.from_pretrained(**tokenizer_cfg)

    print('[*] Loading datasets')
    datasets = load_datasets(dataset_paths, tokenizer.max_len, is_train=is_train)

    print('\n[*] Creating data loaders')
    # A dict of datasets yields a dict of loaders keyed the same way;
    # a single dataset yields a single loader.
    if type(datasets) is dict:
        return {
            split_name: create_data_loader(split_dataset, tokenizer, **data_loader_cfg)
            for split_name, split_dataset in datasets.items()
        }
    return create_data_loader(datasets, tokenizer, **data_loader_cfg)
def __init__(
    self,
    lang: str = 'en',
):
    """Load the tokenizer and encoder model for the given language.

    :param lang: "en" (ALBERT), "fr" (CamemBERT), "es" (Spanish BERT via Auto
        classes) or "ja" (Japanese BERT)
    :raises ImportError: when the transformers extra is not installed
    """
    try:
        from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer, AutoTokenizer
        # FIX: BertModel was missing from this import, so lang="ja" crashed
        # with a NameError when constructing the model below.
        from transformers import AlbertModel, BertModel, CamembertModel, AutoModel
    except ImportError:
        msg = "importing bert dep failed."
        msg += "\n try to install sister by `pip install sister[bert]`."
        raise ImportError(msg)

    if lang == "en":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        model = AlbertModel.from_pretrained("albert-base-v2")
    elif lang == "fr":
        tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        model = CamembertModel.from_pretrained("camembert-base")
    elif lang == "es":
        tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
        model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
    elif lang == "ja":
        tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
        model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

    self.tokenizer = tokenizer
    self.model = model
def __init__(self, args, num_ner_labels):
    """Build the entity model: a tokenizer plus an (ALBERT or BERT) span encoder.

    :param args: namespace with at least model, bert_model_dir, use_albert
        and max_span_length attributes
    :param num_ner_labels: number of NER label classes for the encoder head
    """
    super().__init__()

    bert_model_name = args.model
    vocab_name = bert_model_name

    # A local checkpoint directory overrides the hub model name; the tokenizer
    # is loaded from the same directory (the vocab.txt variant is retired).
    if args.bert_model_dir is not None:
        bert_model_name = str(args.bert_model_dir) + '/'
        # vocab_name = bert_model_name + 'vocab.txt'
        vocab_name = bert_model_name
    logger.info('Loading BERT model from {}'.format(bert_model_name))

    if args.use_albert:
        self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
        self.bert_model = AlbertForEntity.from_pretrained(
            bert_model_name,
            num_ner_labels=num_ner_labels,
            max_span_length=args.max_span_length)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
        self.bert_model = BertForEntity.from_pretrained(
            bert_model_name,
            num_ner_labels=num_ner_labels,
            max_span_length=args.max_span_length)

    # Start on CPU; move_model_to_cuda() relocates when a GPU is available.
    self._model_device = 'cpu'
    self.move_model_to_cuda()
def __init__(self, args, device='cpu'):
    """Load a RACE-style reading-comprehension dataset, tokenize it with
    ALBERT, convert samples to ids and save the tensors to args.save_name.

    :param args: namespace with bert_model, data_dir and save_name attributes
    :param device: not used in this constructor — presumably kept for
        interface compatibility; confirm against callers
    """
    # self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    self.tokenizer = AlbertTokenizer.from_pretrained(args.bert_model)
    self.data_dir = args.data_dir
    file_list = get_json_file_list(args.data_dir)
    self.data = []
    #max_article_len = 0
    for file_name in file_list:
        data = json.loads(open(file_name, 'r').read())
        # Tag each sample by school level inferred from its file path.
        data['high'] = 0
        if ('high' in file_name):
            data['high'] = 1
        self.data.append(data)
    self.data_objs = []
    high_cnt = 0
    middle_cnt = 0
    for sample in self.data:
        high_cnt += sample['high']
        middle_cnt += (1 - sample['high'])
        self.data_objs += self._create_sample(sample)
    print('high school sample:', high_cnt)
    print('middle school sample:', middle_cnt)
    # Convert every sample in place to token ids before serialization.
    for i in range(len(self.data_objs)):
        self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
        #break
    torch.save(self.data_objs, args.save_name)
def __init__(self):
    """ALBERT-based scorer: a shared pretrained encoder feeding an 11-way
    score head, a single-output regression head and a sigmoid activation."""
    super().__init__()
    pretrained = 'albert-base-v2'
    self.bert = AlbertModel.from_pretrained(pretrained)
    self.tokenizer = AlbertTokenizer.from_pretrained(pretrained)
    # Heads over the 768-dim pooled encoder output.
    self.score_fc = nn.Linear(768, 11)
    self.regression_fc = nn.Linear(768, 1)
    self.sigmoid = nn.Sigmoid()
def load_and_predict(data_dir, model_type, pretrain_model):
    """Load the QA model/tokenizer selected by ``model_type``, restore weights
    from ``pretrain_model`` and return predictions on the test set in ``data_dir``.

    :param data_dir: directory containing the test data
    :param model_type: 'bert_japanese', 'bert_multilingual' or 'albert'
    :param pretrain_model: path of the fine-tuned state dict to load
    :return: prediction dictionary
    :raises ValueError: for an unsupported ``model_type`` (the original fell
        through and crashed later with a NameError)
    """
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')
    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)
    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')
    else:
        raise ValueError(f"Unsupported model_type: {model_type!r}")
    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)
    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))
    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  # prediction dictionary
def build(args):
    """Build the ALBERT QA model and the train/dev/test data loaders.

    :param args: namespace with only_positive, batch_size, device,
        load_pretrained_model and pretrained_model_path attributes
    :return: (model, trainloader, devloader, testloader, tokenizer)
    """
    TAG = create_tags()
    XLSX_PATH = {'train': 'release/train/ca_data',
                 'dev': 'release/dev/ca_data',
                 'test': 'release/test/ca_data'}
    PRETRAINED_MODEL_NAME = 'ALINEAR/albert-japanese-v2'
    tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    train_data = TrainData(XLSX_PATH['train'], TAG,
                           only_positive=args.only_positive)
    trainset = QADataset(train_data.examples, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=args.batch_size,
                             collate_fn=collate_fn)
    train_data = TrainData(XLSX_PATH['train'], TAG, only_positive=args.only_positive) if False else train_data
    dev_data = TrainData(XLSX_PATH['dev'], TAG,
                         only_positive=args.only_positive)
    devset = QADataset(dev_data.examples, "train", tokenizer=tokenizer)
    devloader = DataLoader(devset, batch_size=args.batch_size,
                           collate_fn=collate_fn)
    logger.info(f"[train data] {train_data.summary()}")
    logger.info(f"[dev data] {dev_data.summary()}")
    # NOTE(review): the "test" loader is built from the dev split
    # (XLSX_PATH['dev']), not XLSX_PATH['test'] — confirm this is intentional.
    test_data = TestData(XLSX_PATH['dev'], TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=args.batch_size,
                            collate_fn=collate_fn)
    model = AlbertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)
    model = model.to(args.device)
    # Optionally warm-start from a previously fine-tuned checkpoint.
    if args.load_pretrained_model:
        model.load_state_dict(torch.load(args.pretrained_model_path))
    return model, trainloader, devloader, testloader, tokenizer
def __init__(
    self,
    lang: str = "en",
):
    """Load a hidden-states-emitting encoder and its tokenizer for ``lang``.

    :param lang: "en" (ALBERT), "fr" (CamemBERT) or "ja" (Japanese BERT)
    :raises ImportError: when the transformers extra is not installed
    :raises ValueError: for any other language code (the original fell
        through and crashed with an UnboundLocalError)
    """
    try:
        from transformers import (AlbertModel, AlbertTokenizer, BertConfig,
                                  BertJapaneseTokenizer, BertModel,
                                  CamembertModel, CamembertTokenizer)
    except ImportError:
        msg = "importing bert dep failed."
        msg += "\n try to install sister by `pip install sister[bert]`."
        raise ImportError(msg)

    # NOTE(review): BertConfig is used to request output_hidden_states for
    # the ALBERT and CamemBERT models too — presumably intentional; confirm.
    if lang == "en":
        model_name = "albert-base-v2"
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = AlbertModel.from_pretrained(model_name, config=config)
    elif lang == "fr":
        model_name = "camembert-base"
        tokenizer = CamembertTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config)
    elif lang == "ja":
        model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
        tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config)
    else:
        raise ValueError(f"Unsupported lang: {lang!r}")

    self.tokenizer = tokenizer
    self.model = model
def load_transformer(model_type):
    """Return (model, tokenizer) for the requested transformer type.

    Every model is a TF sequence classifier configured with a single
    regression label (num_labels=1).

    :raises ValueError: for an unknown ``model_type``
    """
    # Select the (tokenizer class, model class, pretrained weights) triple;
    # construction happens once below, after selection.
    if model_type == "distilbert":
        tok_cls, model_cls, weights = (DistilBertTokenizer,
                                       TFDistilBertForSequenceClassification,
                                       'distilbert-base-uncased')
    elif model_type == "bert_x12":
        tok_cls, model_cls, weights = (BertTokenizer,
                                       TFBertForSequenceClassification,
                                       'bert-base-uncased')
    elif model_type == "bert_x24":
        tok_cls, model_cls, weights = (BertTokenizer,
                                       TFBertForSequenceClassification,
                                       'bert-large-uncased')
    elif model_type == "albert_v2_x12":
        tok_cls, model_cls, weights = (AlbertTokenizer,
                                       TFAlbertForSequenceClassification,
                                       'albert-base-v2')
    elif model_type == "longformer_x12":
        tok_cls, model_cls, weights = (LongformerTokenizer,
                                       TFLongformerForSequenceClassification,
                                       'allenai/longformer-base-4096')
    elif model_type == "longformer_x24":
        tok_cls, model_cls, weights = (LongformerTokenizer,
                                       TFLongformerForSequenceClassification,
                                       'allenai/longformer-large-4096')
    else:
        raise ValueError(model_type + " was invalid")
    tokenizer = tok_cls.from_pretrained(weights)
    model = model_cls.from_pretrained(weights, num_labels=1)
    return model, tokenizer
def load_test_data(file_path, batch_size=32, tsv=False):
    """Read a CSV of hypotheses (column `h`) and labels, tokenize with ALBERT
    and build a sequential DataLoader over (input_ids, attention_mask, label).

    Note: `tsv` is accepted for interface compatibility but is not used here.

    :return: (test_dataloader, sentences, raw_labels)
    """
    frame = pd.read_csv(file_path)
    sentences = frame.h.values
    raw_labels = frame.label.values
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    encoded = tokenizer(list(sentences),
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=128).to(device)
    labels = torch.tensor(raw_labels).unsqueeze(1).to(device)
    test_dataset = TensorDataset(encoded['input_ids'],
                                 encoded['attention_mask'], labels)
    # Dataloading: sequential sampling preserves row order at prediction time.
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=batch_size)
    return (test_dataloader, sentences, raw_labels)
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa', use_counter=True):
    """Assemble a BAN (Bilinear Attention Network) model for VQA or Flickr
    grounding, using a frozen ALBERT encoder for the question.

    :param dataset: dataset object providing v_dim / num_ans_candidates
    :param num_hid: hidden size of the attention and classifier layers
    :param op: word-embedding op string (kept from the original WordEmbedding API)
    :param gamma: number of glimpses / bilinear attention maps
    :param task: 'vqa' or 'flickr'
    :param use_counter: whether to attach the object Counter module (VQA only)
    """
    #w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    #q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    # ALBERT replaces the original word/question embeddings; note w_emb here
    # is a tokenizer, not an embedding module.
    w_emb = AlbertTokenizer.from_pretrained('albert-large-v2')
    q_emb = AlbertModel.from_pretrained('albert-large-v2')
    # Freeze the ALBERT encoder; collect its params so the caller can
    # distinguish them (e.g. exclude them from the optimizer).
    params_set = set()
    for param in q_emb.parameters():
        params_set.add(param)
        param.requires_grad = False
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        # One bilinear connection network + projections per glimpse.
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects) if use_counter else None
        return BanModel(dataset, params_set, w_emb, q_emb, v_att, b_net,
                        q_prj, c_prj, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
def load_tokenizer(cls, transformer: str):
    """
    Loads the tokenizer based on the given transformer name.

    A pickled copy is cached under data/; when present it is loaded instead
    of being rebuilt. Otherwise a fresh tokenizer is created, extended with
    the entity-marker tokens, and cached for next time.

    Args:
        transformer: Name of huggingface transformer
    """
    tokenizer_path = "data/{0}_tokenizer.pkl".format(transformer)
    if os.path.isfile(tokenizer_path):
        logger.info("Loading tokenizer from saved path.")
        with open(tokenizer_path, "rb") as pkl_file:
            return joblib.load(pkl_file)
    elif "albert" in transformer:
        tokenizer = AlbertTokenizer.from_pretrained(transformer,
                                                    do_lower_case=False)
    else:
        tokenizer = BertTokenizer.from_pretrained(transformer,
                                                  do_lower_case=False,
                                                  add_special_tokens=True)
    # Entity-boundary and blank markers used by the relation-extraction
    # input format.
    tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])
    with open(tokenizer_path, "wb") as output:
        joblib.dump(tokenizer, output)
    logger.info("Saved {0} tokenizer at {1}".format(
        transformer, tokenizer_path))
    return tokenizer
def _albert(self): from transformers import AlbertTokenizer # noqa pylint: disable=import-outside-toplevel self.lm_model_tokenizer = AlbertTokenizer.from_pretrained( 'albert-base-v2') self.lm_padding_value = self.lm_model_tokenizer._convert_token_to_id( '<pad>') space_value = self.lm_model_tokenizer._convert_token_to_id('▁') self.id2lm_tokens = {} for i, d in enumerate(self.data): normalized_text = d["normalized_text"] assert isinstance(self.text_tokenizer, EnglishPhonemesTokenizer) or isinstance( self.text_tokenizer, EnglishCharsTokenizer) preprocess_text_as_tts_input = self.text_tokenizer.text_preprocessing_func( normalized_text) lm_tokens_as_ids = self.lm_model_tokenizer.encode( preprocess_text_as_tts_input, add_special_tokens=False) if self.text_tokenizer.pad_with_space: lm_tokens_as_ids = [space_value ] + lm_tokens_as_ids + [space_value] self.id2lm_tokens[i] = lm_tokens_as_ids
def load_data(file_path, oversample=False):
    """Read a CSV of sentences and labels, drop rows with missing sentences,
    tokenize with ALBERT, and return tensors on `device`.

    Note: `oversample` is accepted for interface compatibility but is not
    used here.

    :return: (input_ids, attention_masks, labels)
    """
    frame = pd.read_csv(file_path)
    sentences = frame.sentences.values
    labels = frame.labels.values
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    # Collect indices of missing sentences and drop them from both arrays
    # so sentences and labels stay aligned.
    null_rows = [i for i, s in enumerate(sentences) if pd.isnull(s)]
    sentences = np.delete(sentences, null_rows)
    labels = np.delete(labels, null_rows)
    encoding = tokenizer(list(sentences),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)
    labels = torch.tensor(labels).unsqueeze(1).to(device)
    return encoding['input_ids'], encoding['attention_mask'], labels