def __init__(self, params, shared=None):
    super(BiEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not params["no_cuda"] else "cpu"
    )
    self.n_gpu = torch.cuda.device_count()
    # init tokenizer
    self.NULL_IDX = 0
    self.START_TOKEN = "[CLS]"
    self.END_TOKEN = "[SEP]"
    self.tokenizer = BertTokenizer.from_pretrained(
        params["bert_model"], do_lower_case=params["lowercase"])
    # init model
    self.build_model()
    model_path = params.get("path_to_model", None)
    if model_path is not None:
        self.load_model(
            model_path,
            cand_enc_only=params.get("load_cand_enc_only", False),
        )
    self.model = self.model.to(self.device)  # todo
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)

def __init__(self, args):
    torch.manual_seed(args.seed)
    self.args = args
    # Tokenizer, Generator, Discriminator
    if args.load_epoch > -1:  # NOTE: 0-indexed. Load from trained
        gen_path, dis_path = get_gan_path(self.args.model_out, self.args.load_epoch)
    else:
        gen_path, dis_path = args.bert_model, args.bert_model
    self.tokenizer = BertTokenizer.from_pretrained(gen_path)
    # TODO requires_grad = False?
    self.generator = BertForMaskedLM.from_pretrained(gen_path)
    self.discriminator = BertForSequenceClassification.from_pretrained(
        dis_path, num_labels=self.args.num_labels)
    # Optimizer
    self.optimizerG = self._get_optimizer_(self.generator)
    self.optimizerD = self._get_optimizer_(self.discriminator)
    # DataLoader
    self.msk_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'masked')
    self.org_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'original')
    self.mask_id = self.tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    self.device = torch.device("cuda:0" if args.cuda else "cpu")
    self.generator.to(self.device)
    self.discriminator.to(self.device)

def main(args): print(f"\nmain({args})\n") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) if args.rand_mask > 0: fin_name = 'general_in_rand_mask.txt' args.output_dir = args.output_dir / 'rand_mask_bert_pregen' else: fin_name = 'general_in_lcs.txt' args.output_dir = args.output_dir / 'lcs_bert_pregen' with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: with open(os.path.join(args.data_path,fin_name)) as f: for line in tqdm(f, desc="Loading Dataset", unit=" lines"): # mwp_ans is a list of tuples ('hello [MASK] ! [SEP] how are you ? [SEP]', 'world') mwp, ans = line[6:].strip().split('$$$') # [6:] to avoid "[CLS] " sents = mwp.split(' [SEP]')[:-1] ans = ans.split() ans = [[ans.pop(0) for _ in range(s.count('[MASK]'))] for s in sents] #docs.add_document(list(zip([tokenizer.tokenize(s) for s in sents], ans))) docs.add_document(list(zip([s.split() for s in sents], ans))) # It's bert-tokenized in make_data assert len(docs) > 1 args.output_dir.mkdir(exist_ok=True) for epoch in range(args.epochs_to_generate): my_create_training_file(docs, vocab_list, args, epoch)
def load_datasets(task_cfg, splits):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    loaders = {}
    for split in splits:
        loaders[split] = get_loader(task_cfg, tokenizer, split)
    return loaders

def __init__(self, params, shared=None):
    super(CrossEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not params["no_cuda"] else "cpu"
    )
    self.n_gpu = torch.cuda.device_count()

    if params.get("roberta"):
        self.tokenizer = RobertaTokenizer.from_pretrained(params["bert_model"])
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            params["bert_model"], do_lower_case=params["lowercase"])

    special_tokens_dict = {
        "additional_special_tokens": [
            ENT_START_TAG,
            ENT_END_TAG,
            ENT_TITLE_TAG,
        ],
    }
    self.tokenizer.add_special_tokens(special_tokens_dict)
    self.NULL_IDX = self.tokenizer.pad_token_id
    self.START_TOKEN = self.tokenizer.cls_token
    self.END_TOKEN = self.tokenizer.sep_token

    # init model
    self.build_model()
    if params["path_to_model"] is not None:
        self.load_model(params["path_to_model"])

    self.model = self.model.to(self.device)
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)

def load_dataset(task_cfg, split):
    assert split == "eval"
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    loaders = {}
    loaders[split] = get_loader(task_cfg, tokenizer, split)
    return loaders

def __init__(self, args=None, labels=None, device='cuda', bert_model_path='bert-base-uncased',
             architecture="DocumentBertLSTM", batch_size=10, bert_batch_size=7,
             learning_rate=5e-5, weight_decay=0, use_tensorboard=False):
    if args is not None:
        self.args = vars(args)
    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size
        self.args['labels'] = labels
        self.args['bert_batch_size'] = bert_batch_size
        self.args['architecture'] = architecture
        self.args['use_tensorboard'] = use_tensorboard
    if 'fold' not in self.args:
        self.args['fold'] = 0

    assert self.args['labels'] is not None, "Must specify all labels in prediction"

    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    # account for some random tensorflow naming scheme
    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT-based model you are attempting to load.")
    else:
        config = BertConfig.from_pretrained(self.args['bert_model_path'])
    config.num_labels = len(self.args['labels'])
    config.bert_batch_size = self.args['bert_batch_size']

    if 'use_tensorboard' in self.args and self.args['use_tensorboard']:
        assert 'model_directory' in self.args, "Must have a logging and checkpoint directory set."
        from torch.utils.tensorboard import SummaryWriter
        self.tensorboard_writer = SummaryWriter(os.path.join(
            self.args['model_directory'], "..", "runs",
            self.args['model_directory'].split(os.path.sep)[-1]
            + '_' + self.args['architecture'] + '_' + str(self.args['fold'])))

    self.bert_doc_classification = document_bert_architectures[self.args['architecture']].from_pretrained(
        self.args['bert_model_path'], config=config)
    self.optimizer = torch.optim.Adam(
        self.bert_doc_classification.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate']
    )

def __init__(self, debug, args, data_dir, data_process_output):
    self.eval_steps = args.eval_steps
    self.adam_epsilon = args.adam_epsilon
    self.warmup_steps = args.warmup_steps
    self.learning_rate = args.learning_rate
    self.weight_decay = args.weight_decay
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.device = torch.device('cuda')
    self.debug = debug
    self.seed = 2019
    self.args = args
    self.data_dir = args.data_dir
    self.max_seq_length = args.max_seq_length
    self.batch_size = args.per_gpu_train_batch_size
    self.train_steps = args.train_steps
    self.tokenizer = BertTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    self.config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    self.seed_everything()
    self.do_eval = True
    self.data_dir = data_dir
    self.data_process_output = data_process_output
    self.output_dir = './'

def __init__(self):
    self.model_path = 'output/model'
    # self.processor = ATEPCProcessor()
    # self.labels = self.processor.get_labels()
    # self.n_class = len(self.labels)
    self.tokenizer = BertTokenizer.from_pretrained('./output/model/vocab.txt')
    self.device = torch.device("cuda:5" if torch.cuda.is_available() else 'cpu')

def __init__(self, params, shared=None):
    super(BiEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not params["no_cuda"] else "cpu"
    )
    self.n_gpu = torch.cuda.device_count()
    # init tokenizer
    self.NULL_IDX = 0
    self.START_TOKEN = "[CLS]"
    self.END_TOKEN = "[SEP]"
    vocab_path = os.path.join(params["bert_model"], 'vocab.txt')
    if os.path.isfile(vocab_path):
        print(f"Found tokenizer vocabulary at {vocab_path}")
    self.tokenizer = BertTokenizer.from_pretrained(
        vocab_path if os.path.isfile(vocab_path) else params["bert_model"],
        do_lower_case=params["lowercase"])
    # init model
    self.build_model()
    # Path to pytorch_model.bin for the biencoder model (not the pre-trained BERT model)
    model_path = params.get("path_to_biencoder_model")
    if model_path is None:
        model_path = params.get("path_to_model")
    if model_path is not None:
        self.load_model(model_path)
    self.model = self.model.to(self.device)
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)

def __init__(self, data_dir, output_dir, num_labels, args):
    self.data_dir = data_dir
    self.output_dir = output_dir
    self.num_labels = num_labels
    self.weight_decay = args.weight_decay
    self.eval_steps = args.eval_steps
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.warmup_steps = args.warmup_steps
    self.learning_rate = args.learning_rate
    self.adam_epsilon = args.adam_epsilon
    self.train_steps = args.train_steps
    self.per_gpu_eval_batch_size = args.per_gpu_eval_batch_size
    self.train_batch_size = args.per_gpu_train_batch_size
    self.eval_batch_size = self.per_gpu_eval_batch_size
    self.do_lower_case = args.do_lower_case
    self.model_name_or_path = args.model_name_or_path
    self.max_seq_length = args.max_seq_length
    self.seed = args.seed
    self.seed_everything()
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_name_or_path, do_lower_case=self.do_lower_case)
    self.do_test = args.do_test
    self.do_eval = True
    self.args = args

def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)

        if args.num_workers > 1:
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch)

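The script above expects the input-file convention described in its error message: one sentence per line, with a blank line marking the boundary between documents. A minimal sketch of writing a toy corpus in that format (the filename and sentences are made up for illustration):

# Hedged sketch: produce a toy corpus with blank-line document breaks,
# matching what the pregeneration loop above reads line by line.
toy_docs = [
    ["The first document has two sentences.", "This is the second one."],
    ["The second document starts here.", "Blank lines mark the boundary."],
]
with open("toy_corpus.txt", "w") as f:
    for doc in toy_docs:
        for sentence in doc:
            f.write(sentence + "\n")
        f.write("\n")  # blank line = document break, required by the script
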
def __init__(self, params):
    'Initialization'
    self.numDataPoints = {}
    num_samples_train = params['num_train_samples']
    num_samples_val = params['num_val_samples']
    self._image_features_reader = ImageFeaturesH5Reader(params['visdial_image_feats'])

    with open(params['visdial_processed_train_dense']) as f:
        self.visdial_data_train = json.load(f)
        if params['overfit']:
            if num_samples_train:
                self.numDataPoints['train'] = num_samples_train
            else:
                self.numDataPoints['train'] = 5
        else:
            if num_samples_train:
                self.numDataPoints['train'] = num_samples_train
            else:
                self.numDataPoints['train'] = len(self.visdial_data_train['data']['dialogs'])

    with open(params['visdial_processed_val']) as f:
        self.visdial_data_val = json.load(f)
        if params['overfit']:
            if num_samples_val:
                self.numDataPoints['val'] = num_samples_val
            else:
                self.numDataPoints['val'] = 5
        else:
            if num_samples_val:
                self.numDataPoints['val'] = num_samples_val
            else:
                self.numDataPoints['val'] = len(self.visdial_data_val['data']['dialogs'])

    self.overfit = params['overfit']

    with open(params['visdial_processed_train_dense_annotations']) as f:
        self.visdial_data_train_ndcg = json.load(f)
    with open(params['visdial_processed_val_dense_annotations']) as f:
        self.visdial_data_val_ndcg = json.load(f)

    # train/val setup
    self.numDataPoints['trainval'] = self.numDataPoints['train'] + self.numDataPoints['val']

    self.num_options = params["num_options"]
    self._split = 'train'
    self.subsets = ['train', 'val', 'trainval']
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.tokenizer = tokenizer
    # fetch the token indices of the [CLS], [MASK] and [SEP] tokens
    tokens = ['[CLS]', '[MASK]', '[SEP]']
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    self.CLS = indexed_tokens[0]
    self.MASK = indexed_tokens[1]
    self.SEP = indexed_tokens[2]
    self.params = params
    self._max_region_num = 37

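The special-token ids the dataset caches can be sanity-checked directly. A minimal sketch, assuming the `transformers` package (older `pytorch_transformers` / `pytorch_pretrained_bert` expose the same `BertTokenizer` class and `convert_tokens_to_ids` call):

# Hedged sketch: look up the ids the dataset above stores as self.CLS / self.MASK / self.SEP.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cls_id, mask_id, sep_id = tokenizer.convert_tokens_to_ids(['[CLS]', '[MASK]', '[SEP]'])
print(cls_id, mask_id, sep_id)  # for the stock bert-base-uncased vocab: 101 103 102
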
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    set_seed()

    # args.output_dir = os.path.join(args.data_dir, args.task_name, args.output_dir)
    # args.temp_score_file_path = os.path.join(args.data_dir, args.task_name, args.temp_score_file_path)
    # args.input_cache_dir = os.path.join(args.data_dir, args.task_name, args.input_cache_dir)
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)
    # if not os.path.exists(args.input_cache_dir):
    #     os.makedirs(args.input_cache_dir)

    # myDataProcessorUtt = MyDataProcessorUtt(args.max_utterance_num)
    myDataProcessorSeg = MyDataProcessorSegres()
    # label_list = myDataProcessorUtt.get_labels()
    # num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.bert_model)

    if args.do_train:
        logger.info("start train...")
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        # if os.path.exists(output_model_file):
        #     logger.info("load dict...")
        #     model_state_dict = torch.load(output_model_file)
        #     model = BertForSequenceClassificationTS.from_pretrained(args.bert_model, config=config,
        #                                                             state_dict=model_state_dict, num_labels=num_labels)
        # else:
        model = BertForSequenceClassificationTSv3.from_pretrained(
            args.bert_model, config=config,
            max_seg_num=args.max_segment_num,
            max_seq_len=args.max_seq_length,
            device=device)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        train(model, tokenizer, device, myDataProcessorSeg, n_gpu)
    else:
        logger.info("start test...")
        logger.info("load dict...")
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassificationTSv3.from_pretrained(
            args.bert_model, config=config, state_dict=model_state_dict,
            max_seg_num=args.max_segment_num,
            max_seq_len=args.max_seq_length,
            device=device)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # similar_score(model, tokenizer, device, myDataProcessorSeg)
        result = eval(model, tokenizer, device, myDataProcessorSeg)
        logger.info(
            "Evaluation Result: \nMAP: %f\tMRR: %f\tP@1: %f\tR1: %f\tR2: %f\tR5: %f",
            result[0], result[1], result[2], result[3], result[4], result[5])
        print(result)

def main():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    reader = BertMCQParallelReader()
    out = reader.read("dummy_data.jsonl", tokenizer, 70, None)
    print(len(out))
    tokens, segs, masks, labels = out[0]

def __init__(self, config, *args, **kwargs):
    self.max_length = config.max_length
    self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
    self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
    if self.get_qgen_inds:
        print('computing question generation indices in bert tokenizer')

def preprocess(self, data, opt):
    """ Preprocess the data and convert to ids. """
    processed = []
    tqdm_data = tqdm(data)
    if opt["lower"]:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    for d in tqdm_data:
        bert_tokenize(tokenizer, d, opt)
        tokens = list(d["token"])
        seq_len = len(tokens) + 2
        # anonymize tokens
        ss, se = d['subj_start'], d['subj_end']
        os, oe = d['obj_start'], d['obj_end']
        pos = d['stanford_pos']
        ner = d['stanford_ner']
        deprel = d['stanford_deprel']
        head = [int(x) for x in d['stanford_head']]
        assert any([x == 0 for x in head])
        positions = get_positions(d['subj_start'] + 1, d['subj_end'] + 1,
                                  d['obj_start'] + 1, d['obj_end'] + 1,
                                  self.e_type2idx[d["subj_type"]],
                                  self.e_type2idx[d["obj_type"]], seq_len)
        subj_type = d['subj_type']
        obj_type = d['obj_type']
        relation = self.r_type2idx[d['relation']]
        processed.append({
            "len": seq_len,
            "tokens": tokens,
            "pos": pos,
            "ner": ner,
            "deprel": deprel,
            "head": head,
            "position": positions,
            "s_type": subj_type,
            "o_type": obj_type,
            "relation": relation
        })
    return processed

def __init__(self) -> None:
    os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
    self.client = CoreNLPClient()
    self.client.ensure_alive()
    self.do_lower_case = '-cased' not in config.bert_model
    self.basic_tokenizer: BasicTokenizer \
        = BertTokenizer.from_pretrained(config.bert_model,
                                        do_lower_case=self.do_lower_case).basic_tokenizer

def make_gan_data(mathqa_train, out_dir, bert_model, do_lower_case, subset=1e8):
    """
    Parameters:
        mathqa_train  str  path to MathQA train.json
        out_dir       str  path to output dir
        subset        int  size of subset adopted
    NOTE: use the whole train set rather than 3k (actually 2.4k) for training
    """
    def is_bad(sent):
        # 21476 out of 29837 MWPs used (0.7198)
        if any([s in '+-*/|@' for s in sent]):
            return True
        return sum([1 for s in sent if s.isalpha()]) < len(sent) / 2

    with open(mathqa_train, 'r') as jsonfile:
        obj = json.load(jsonfile)
    print(f'{len(obj)} MWPs from {mathqa_train}')

    good_mwps, bad_mwps = [], []
    for imwp in trange(len(obj)):
        mwp = obj[imwp]
        if is_bad(mwp['Problem']):  # or mwp['category'] == 'other':  # TODO
            bad_mwps.append(mwp)
            continue
        # 27688 out of 29837, i.e. 92.8% kept
        good_mwps.append(mwp)
        if len(good_mwps) == subset:
            break

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    with open(os.path.join(out_dir, 'mathqa.txt'), 'w') as fout:
        for imwp in trange(len(good_mwps)):
            mwp = good_mwps[imwp]
            problem = ' [SEP]'.join(sent_tokenize(mwp['Problem'])) + ' [SEP]'
            toks = [CLS]
            for tok in problem.split():
                if tok != SEP:
                    toks.extend(tokenizer.tokenize(tok))
                    # if random() > 0.5:
                    #     toks.extend(['[MASK]'] * len(tokenizer.tokenize(tok)))
                else:
                    toks.append(SEP)
            fout.writelines(' '.join(toks) + '@@@' + mwp['category'] + '\n')

def __init__(self, config, *args, **kwargs):
    self.max_length = config.max_length
    pythia_root = get_pythia_root()
    VOCAB = 'bert-base-uncased-vocab.txt'
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        os.path.join(pythia_root, config.model_data_dir, 'bert', VOCAB))
    assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
    self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
    if self.get_qgen_inds:
        print('computing question generation indices in bert tokenizer')

def load_pretrain(configs, model_class, fine_tune_dir, processor, eval=False):
    """
    configs: configuration dict
    model_class: model class name
    fine_tune_dir: directory where the fine-tuned model is saved
    processor: DataProcessor
    eval: whether to run in evaluation mode
    """
    model_class_map = {
        'Bert': Bert,
        'BertCRF': BertCRF,
        'BertBiLSTMCRF': BertBiLSTMCRF,
        'BiLSTM': BiLSTM,
        'BiLSTMCRF': BiLSTMCRF
    }
    model_class_ = model_class_map[model_class]
    label_list = processor.get_labels()
    check_dir(fine_tune_dir)
    if eval:
        model_pretrained_path = fine_tune_dir
    else:
        model_pretrained_path = configs['pretrained_model_dir']
    tokenizer = BertTokenizer.from_pretrained(
        model_pretrained_path, do_lower_case=configs['lower_case'])
    if model_class in ['Bert', 'BertCRF', 'BertBiLSTMCRF']:
        bert_config = BertConfig.from_pretrained(model_pretrained_path,
                                                 num_labels=len(label_list),
                                                 finetuning_task="ner")
        model = model_class_.from_pretrained(model_pretrained_path,
                                             config=bert_config,
                                             model_configs=configs)
    elif model_class in ['BiLSTM', 'BiLSTMCRF']:
        configs['num_labels'] = len(label_list)
        if configs['use_pretrained_embedding']:
            pretrained_word_embed = build_word_embed(
                tokenizer,
                pretrain_embed_file=configs['pretrain_embed_file'],
                pretrain_embed_pkl=configs['pretrain_embed_pkl'])
            configs['word_vocab_size'] = pretrained_word_embed.shape[0]
            configs['word_embedding_dim'] = pretrained_word_embed.shape[1]
        else:
            pretrained_word_embed = None
        if eval:
            model_pretrained_path = fine_tune_dir
            model = model_class_.from_pretrained(model_pretrained_path, pretrained_word_embed)
        else:
            model = model_class_(configs, pretrained_word_embed)
    else:
        raise ValueError("Invalid Model Class")
    return model, tokenizer

def test_sequence_builders(self):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    text = tokenizer.encode("sequence builders")
    text_2 = tokenizer.encode("multi-sequence build")

    encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
    encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

    assert encoded_sentence == [101] + text + [102]
    assert encoded_pair == [101] + text + [102] + text_2 + [102]

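The helpers used in this test come from the older pytorch-transformers API. A minimal sketch of the equivalent calls, assuming the newer `transformers` package where they were folded into `encode()` and `build_inputs_with_special_tokens()`:

# Hedged sketch: same check against the newer `transformers` API.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

assert encoded_sentence == [101] + text + [102]               # [CLS] A [SEP]
assert encoded_pair == [101] + text + [102] + text_2 + [102]  # [CLS] A [SEP] B [SEP]
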
def LoadDatasetEval(args, config, task_cfg, task_id):
    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(args.bert_model,
                                                     do_lower_case=args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=args.do_lower_case)

    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]

    # initialize the feature reader
    feats_h5path1 = task_cfg[task]["features_h5path1"]
    feats_h5path2 = task_cfg[task]["features_h5path2"]
    features_reader1 = (ImageFeaturesH5Reader(feats_h5path1, config, args.in_memory)
                        if feats_h5path1 != "" else None)
    features_reader2 = (ImageFeaturesH5Reader(feats_h5path2, config, args.in_memory)
                        if feats_h5path2 != "" else None)

    batch_size = task_cfg[task].get("eval_batch_size", args.batch_size)
    if args.local_rank != -1:
        batch_size = int(batch_size / dist.get_world_size())

    logger.info("Loading %s Dataset with batch size %d" % (task_name, batch_size))
    if args.split:
        eval_split = args.split
    else:
        eval_split = task_cfg[task]["val_split"]

    dset_val = DatasetMapEval[task_name](
        task=task_cfg[task]["name"],
        dataroot=task_cfg[task]["dataroot"],
        annotations_jsonpath=task_cfg[task]["val_annotations_jsonpath"],
        split=eval_split,
        image_features_reader=features_reader1,
        gt_image_features_reader=features_reader2,
        tokenizer=tokenizer,
        bert_model=args.bert_model,
        padding_index=0,
        max_seq_length=task_cfg[task]["max_seq_length"],
        max_region_num=task_cfg[task]["max_region_num"],
        num_locs=config.num_locs,
        add_global_imgfeat=config.add_global_imgfeat,
        append_mask_sep=(config.fusion_method == 'vl-bert_vqa'),
    )

    dl_val = DataLoader(
        dset_val,
        shuffle=False,
        batch_size=batch_size,
        num_workers=10,
        pin_memory=True,
        drop_last=args.drop_last,
    )
    task2num_iters = {task: len(dl_val)}

    return batch_size, task2num_iters, dset_val, dl_val

def LoadDatasets(args, task_cfg, ids, split="trainval"): tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) task_feature_reader1 = {} task_feature_reader2 = {} for i, task_id in enumerate(ids): task = "TASK" + task_id + "1" if task_cfg[task]["features_h5path1"] not in task_feature_reader1: task_feature_reader1[task_cfg[task]["features_h5path1"]] = None if task_cfg[task]["features_h5path2"] not in task_feature_reader2: task_feature_reader2[task_cfg[task]["features_h5path2"]] = None
def model_samples_from_json(config, token_id_dict, unknown_token_id, type_id_dict,
                            mentions_file, sents_file):
    if config.use_bert:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        print('bert tokenizer loaded')

    sent_tokens_id_dict = dict()
    sent_tokens_dict = dict()
    with open(sents_file, encoding='utf-8') as f:
        for line in f:
            sent = json.loads(line)
            tokens = sent['text'].split(' ')
            sent_tokens_id_dict[sent['sent_id']] = [token_id_dict.get(t, unknown_token_id) for t in tokens]
            sent_tokens_dict[sent['sent_id']] = [t for t in tokens]

    samples = list()
    mentions = datautils.read_json_objs(mentions_file)
    for m in mentions:
        if config.use_bert:
            org_tok_sents = sent_tokens_dict[m['sent_id']]
            bert_sent_tokens = org_tok_sents[:m['span'][0]] + ['[MASK]'] + org_tok_sents[m['span'][1]:]
            full_sent = ' '.join(bert_sent_tokens)
            tokens = ["[CLS]"]
            t = tokenizer.tokenize(full_sent)
            tokens.extend(t)
            mention_token_idx = 0
            for i, x in enumerate(tokens):
                if x == '[MASK]':
                    mention_token_idx = i
                    break
            tokens.append("[SEP]")
            sentence_token = tokenizer.convert_tokens_to_ids(tokens)
        else:
            sentence_token = sent_tokens_id_dict[m['sent_id']]
            mention_token_idx = m['span'][0]
        labels = m['labels']
        label_ids = [type_id_dict[t] for t in labels]
        sample = [m['mention_id'],
                  sent_tokens_id_dict[m['sent_id']][m['span'][0]:m['span'][1]],
                  sentence_token,
                  mention_token_idx,
                  label_ids
                  ]
        samples.append(sample)
    return samples

def __init__(self, params):
    super(CrossEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not params["no_cuda"] else "cpu"
    )
    self.n_gpu = torch.cuda.device_count()

    if params.get("roberta"):
        self.tokenizer = RobertaTokenizer.from_pretrained(
            params["bert_model"], do_lower_case=params["lowercase"])
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            params["bert_model"], do_lower_case=params["lowercase"])

    special_tokens_dict = {
        "additional_special_tokens": [
            ENT_START_TAG,
            ENT_END_TAG,
            ENT_TITLE_TAG,
        ],
    }
    self.tokenizer.add_special_tokens(special_tokens_dict)
    self.NULL_IDX = self.tokenizer.pad_token_id
    self.START_TOKEN = self.tokenizer.cls_token
    self.END_TOKEN = self.tokenizer.sep_token
    self.START_MENTION_ID = self.tokenizer.convert_tokens_to_ids(ENT_START_TAG)
    self.END_MENTION_ID = self.tokenizer.convert_tokens_to_ids(ENT_END_TAG)

    # keep some parameters around
    self.add_sigmoid = params["add_sigmoid"]
    self.margin = params["margin"]
    self.objective = params["objective"]
    self.pos_neg_loss = params.get("pos_neg_loss", False)
    assert self.objective == "softmax" or self.objective == "max_margin"

    # init model
    self.build_model()
    if params["path_to_model"] is not None:
        self.load_model(params["path_to_model"])

    self.model = self.model.to(self.device)
    self.data_parallel = params.get("data_parallel")
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)

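After add_special_tokens, newly registered tags receive ids at the end of the vocabulary, so the wrapped encoder's embedding table usually has to be resized to match (here that would happen inside the project-specific build_model, which is not shown). A minimal, hypothetical sketch of that pattern with a plain BertModel, assuming the `transformers` package; the "[ENT_START]" / "[ENT_END]" tags are placeholders, not the project's actual marker strings:

# Hedged sketch: add extra special tokens and keep the embedding matrix in sync.
from transformers import BertModel, BertTokenizer

ENT_START_TAG, ENT_END_TAG = "[ENT_START]", "[ENT_END]"  # hypothetical tags for illustration
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
num_added = tokenizer.add_special_tokens(
    {"additional_special_tokens": [ENT_START_TAG, ENT_END_TAG]})
model = BertModel.from_pretrained("bert-base-uncased")
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding table for the new ids
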
def score_weak_learner_physical_v2(fname, data):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    model_dir = "/scratch/kkpal/serdir_bertlgww_concat_kb_1e5/"
    model = BertMCQConcat.from_pretrained(
        model_dir,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)))
    model.to(device)
    model = torch.nn.DataParallel(model)
    data_reader = BertMCQConcatReader()
    tokenizer = BertTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking", do_lower_case=True)
    eval_data = data_reader.read_json(json=data, tokenizer=tokenizer,
                                      max_seq_len=128, max_number_premises=10)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=128)
    etq = tqdm(eval_dataloader, desc="Scoring")
    scores = []
    for input_ids, segment_ids, input_mask, label_ids in etq:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            outputs = model(input_ids, segment_ids, input_mask, label_ids)
            tmp_eval_loss = outputs[0]
            logits = outputs[1]

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        for logit, label in zip(logits, label_ids):
            scores.append(softmax(logit)[label])

    with jsonlines.open(fname + "_v2_scores.jsonl", mode='w') as writer:
        for row, score in zip(data, scores):
            if verbose:
                print(row["id"], score)
            writer.write({"id": row["id"], "score": score})

def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file

    Args:
        pretrained_model_name_or_path: Path to pretrained model archive
            or one of pre-trained vocab configs below.
                * bert-base-uncased
                * bert-large-uncased
                * bert-base-cased
                * bert-large-cased
                * bert-base-multilingual-uncased
                * bert-base-multilingual-cased
                * bert-base-chinese
    Keyword args:
        cache_dir: an optional path to a specific directory to download and cache
            the pre-trained model weights.
            Default: None
        do_lower_case: Whether to lower case the input.
            Only has an effect when do_wordpiece_only=False
            Default: True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            Default: True
        max_len: An artificial maximum length to truncate tokenized sequences to;
            Effective maximum length is always the minimum of this
            value (if specified) and the underlying BERT model's
            sequence length.
            Default: None
        never_split: List of tokens which will never be split during tokenization.
            Only has an effect when do_wordpiece_only=False
            Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> import torch
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer

def main():
    batch_size = 16
    max_seq_len = 128
    model_dir = 'fine_tuned--bert-base-uncased--SEQ_LEN=128--BATCH_SIZE=32--HEAD=1'
    output_filename = os.path.join(model_dir, "fine-tuned-sent-classifer-test-results.csv")
    test_sets_dir = os.path.join("dataset", "custom_test_set")
    test_files = [filename for filename in os.listdir(test_sets_dir)]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.to(device)
    criterion = Softmax()

    accuracies = {}
    for filename in test_files:
        print("Testing on dataset: {}".format(filename))
        file_path = os.path.join(test_sets_dir, filename)
        test_dataset = Dataset(file_path, tokenizer, max_seq_len)
        test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size)

        accuracy = 0
        for batch in test_dataloader:
            with torch.no_grad():
                batch = (t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, labels = batch
                outputs = model(input_ids, input_mask, segment_ids)
                logits = outputs[0]
                _, predictions = criterion(logits).max(-1)
                results = predictions == labels
                accuracy += results.sum().item()
        accuracy = accuracy / len(test_dataset)
        print("Model achieved {:.2%} accuracy".format(accuracy))

        dataset_name = filename.split('.')[0]
        accuracies[dataset_name] = accuracy

    with open(output_filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=accuracies.keys())
        writer.writeheader()
        writer.writerow(accuracies)

def __init__(self, args):
    self.valid_step = 1
    self.warmup_steps = 0
    self.adam_epsilon = 1e-8
    self.data_dir = './datapro/ubuntu/'
    self.model_name_or_path = './uncased_L-12_H-768_A-12/'
    self.tokenizer = BertTokenizer.from_pretrained(self.model_name_or_path, do_lower_case=True)
    self.learning_rate = 5e-5
    self.weight_decay = 0.0
    self.train_steps = 10
    self.device = torch.device('cuda')
    self.debug_mode = False
    self.model_name = 'bert'
    self.seed = 2019
    self.seed_everything()
    self.max_len = 128
    self.epochs = 5
    self.batch_size = 16
    self.num_labels = 2
    self.args = args