def get_bert(BERT_PT_PATH, bert_type, do_lower_case, my_pretrain_bert):
    # bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    # vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    # bert_config = BertConfig.from_json_file(bert_config_file)
    # tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    # bert_config.print_status()
    # model_bert = BertModel(bert_config)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=do_lower_case)
    model_bert = BertModel.from_pretrained('bert-base-uncased')
    bert_config = model_bert.config
    if my_pretrain_bert:
        # Path to locally fine-tuned weights; only needed when my_pretrain_bert is set.
        init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Loaded pre-trained parameters.")
    model_bert.to(device)
    return model_bert, tokenizer, bert_config
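# A minimal usage sketch for get_bert above (pytorch-pretrained-bert style API
# assumed; paths and arguments are placeholders): encode one sentence with the
# returned tokenizer and run it through the encoder.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_bert, tokenizer, bert_config = get_bert('./bert', 'base',
                                              do_lower_case=True,
                                              my_pretrain_bert=False)
tokens = ['[CLS]'] + tokenizer.tokenize('what is the capital of france ?') + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
with torch.no_grad():
    # pytorch-pretrained-bert's BertModel returns (encoded_layers, pooled_output)
    encoded_layers, pooled_output = model_bert(input_ids)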
def get_data_iterators_yelp(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=args.lower)
    label_field = tt.data.LabelField(sequential=False, unk_token=None)
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)
    path_format = './.data/yelp_review_polarity_csv/%s.csv.token'

    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True,
                                                       cache_dir='bert/cache')
    train_examples, test_examples = (get_examples_yelp(path_format % ds, train_lm,
                                                       bert_tokenizer=bert_tokenizer)
                                     for ds in ['train', 'test'])
    dev_examples = test_examples[:500]
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field),
                                             ('length', length_field),
                                             ('offset', offset_field),
                                             ('label', label_field)])
                        for ex in [train_examples, dev_examples, test_examples])

    vocab_path = 'vocab/vocab_yelp.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_yelp_bert.pkl'
    if args.fix_test_vocab and not args.use_bert_tokenizer:
        vocab_path = 'vocab/vocab_yelp_fix.pkl'
    c_postfix = '.yelp'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    if args.fix_test_vocab:
        c_postfix += '.fix'
    handle_vocab(vocab_path, text_field, (train, test), args.vector_cache + c_postfix,
                 train_lm, max_size=20000)
    label_field.build_vocab(train)

    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size,
                               device=args.gpu if not map_cpu else 'cpu',
                               shuffle=False)
        for x in (train, dev, test))
    return text_field, label_field, train_iter, dev_iter, test_iter, train, dev
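# Hedged usage sketch: consuming the BucketIterator returned above. The batch
# attribute names follow the fields declared in get_data_iterators_yelp; the
# classifier call is a placeholder.
text_field, label_field, train_iter, dev_iter, test_iter, train, dev = get_data_iterators_yelp()
for batch in train_iter:
    tokens = batch.text    # (seq_len, batch_size) LongTensor of vocab indices
    labels = batch.label   # (batch_size,) LongTensor of class ids
    # logits = classifier(tokens)  # placeholder model call
    break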
def __init__(self, sample_list, max_query_length, max_seq_length,
             train_flag=False, device=None):
    super(BertRCDataset, self).__init__(sample_list, device)
    self.max_query_length = max_query_length
    self.max_seq_length = max_seq_length
    self.tokenizer = BertTokenizer(
        '%s/vocab.txt' % ('./pretrained/chinese_wwm_ext_pytorch'))
    self.cvt = BertInputConverter(self.tokenizer)
    self.train_flag = train_flag
    self.add_bert_fields()
    if train_flag:
        self.sample_list = [
            d for d in self.sample_list if len(d['char_spans']) == 1
        ]
    for sample in self.sample_list:
        tmp = self.cvt.convert(sample['question'], sample['passage'],
                               self.max_query_length, self.max_seq_length,
                               to_tensor=False)
        (input_ids, input_mask, segment_ids) = tmp['input'], tmp['att_mask'], tmp['seg']
        sample.update({
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids
        })
        if train_flag:
            ss, se = sample['char_spans'][0]
            sample['bert_span'] = tmp['pos_map'][ss], tmp['pos_map'][se]
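# BertInputConverter is not shown in this listing; below is a hedged sketch of the
# kind of question/passage packing it appears to perform, matching the keys used
# above ('input', 'att_mask', 'seg', and a char->subtoken 'pos_map'). Truncation
# and padding details are assumptions, not the original implementation.
class SimpleBertInputConverter:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def convert(self, question, passage, max_query_length, max_seq_length, to_tensor=False):
        q_toks = self.tokenizer.tokenize(question)[:max_query_length]
        p_toks, pos_map = [], {}
        for i, ch in enumerate(passage):
            sub = self.tokenizer.tokenize(ch) or ['[UNK]']
            # position of this character's first subtoken after [CLS] q_toks [SEP]
            pos_map[i] = len(q_toks) + 2 + len(p_toks)
            p_toks.extend(sub)
        tokens = ['[CLS]'] + q_toks + ['[SEP]'] + p_toks
        tokens = tokens[:max_seq_length - 1] + ['[SEP]']
        q_len = min(len(q_toks) + 2, len(tokens))
        seg = [0] * q_len + [1] * (len(tokens) - q_len)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        att_mask = [1] * len(input_ids)
        pad = max_seq_length - len(input_ids)  # 0 is [PAD] in standard BERT vocabs
        return {'input': input_ids + [0] * pad,
                'att_mask': att_mask + [0] * pad,
                'seg': seg + [0] * pad,
                'pos_map': pos_map}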
def get_data_iterators_sst_flatten(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=args.lower)
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)
    _, _, _ = tt.datasets.SST.splits(text_field, length_field, fine_grained=False,
                                     train_subtrees=False,
                                     filter_pred=lambda ex: ex.label != 'neutral')
    path_format = './.data/sst/trees/%s.txt'

    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True,
                                                       cache_dir='bert/cache')
    train_ex, dev_ex, test_ex = (get_examples_sst(path_format % ds, train_lm,
                                                  bert_tokenizer=bert_tokenizer)
                                 for ds in ['train', 'dev', 'test'])
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field),
                                             ('length', length_field),
                                             ('offset', offset_field)])
                        for ex in [train_ex, dev_ex, test_ex])

    vocab_path = 'vocab/vocab_sst.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_sst_bert.pkl'
    c_postfix = '.sst'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    handle_vocab(vocab_path, text_field, (train, dev, test),
                 args.vector_cache + c_postfix, train_lm)

    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size,
                               device=args.gpu if not map_cpu else 'cpu',
                               shuffle=False)
        for x in (train, dev, test))
    return text_field, length_field, train_iter, dev_iter, test_iter, train, dev
def test_evaluate_on_file():
    BERT_SERIALIZATION_DIR = './pretrained/chinese_wwm_ext_pytorch'
    tokenizer = BertTokenizer('%s/vocab.txt' % (BERT_SERIALIZATION_DIR))
    device = torch.device('cpu')
    num_fn = functools.partial(generate_bert_pointwise_input, max_seq_len=200,
                               max_passage_len=100, tokenizer=tokenizer, device=device)

    fake_model1 = lambda x, y, z: [[0, 1] for _ in range(len(x))]
    fake_model2 = lambda x, y, z: [[1, 0] for _ in range(len(x))]
    fake_model3 = lambda x, y, z: [[random.choice([0, 1]), random.choice([0, 1])] for _ in range(len(x))]
    fake_model4 = lambda x, y, z: [[random.uniform(0, 1), random.choice([0, 1])] for _ in range(len(x))]
    fake_model1.eval = lambda: None
    fake_model2.eval = lambda: None
    fake_model3.eval = lambda: None
    fake_model4.eval = lambda: None

    test_path = './data/demo/devset/search.dev.2.json'
    results1 = evaluate_on_file(test_path, fake_model1, num_fn,
                                [('accuracy', accuracy), ('precision', precision)])
    results2 = evaluate_on_file(test_path, fake_model2, num_fn,
                                [('accuracy', accuracy), ('precision', precision)])
    results3 = evaluate_on_file(test_path, fake_model3, num_fn,
                                [('accuracy', accuracy), ('precision', precision)])
    results4 = evaluate_on_file(test_path, fake_model4, num_fn,
                                [('precision', precision),
                                 ('precision2', functools.partial(precision, k=2))])

    X, y = load_examples_from_scratch(test_path, concat=False, attach_label='most_related_para')
    assert results1['accuracy'] == sum(y) / len(y)
    assert results2['accuracy'] == (len(y) - sum(y)) / len(y)
    assert precision([[-1, 1], [0, 2], [0, 3], [-1, -1]], [0, 1, 0, 0], k=2) == 0.5
    assert precision([[-1, 1], [0, 2], [0, 3], [-1, -1]], [0, 0, 0, 1], k=2) == 0
    print(results3['accuracy'])
    print(results1['precision'])
    print(results2['precision'])
    print(results4['precision'])
    print(results4['precision2'])
def model_factory(bert_path, device=None, tokenizer=None, **kwargs):
    if device is None:
        device = get_default_device()
    if tokenizer is None:
        tokenizer = BertTokenizer('%s/vocab.txt' % (bert_path))
    model = BertForSequenceClassification.from_pretrained(bert_path,
                                                          num_labels=2,
                                                          **kwargs).to(device)
    return model, tokenizer, device
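# Hedged usage sketch for model_factory above: score one (query, passage) pair.
# The vocab layout under bert_path and the [CLS]/[SEP] pointwise packing follow
# the standard BERT sentence-pair convention; the path is a placeholder.
import torch

model, tokenizer, device = model_factory('./pretrained/chinese_wwm_ext_pytorch')
q_toks = tokenizer.tokenize('你好嗎')
p_toks = tokenizer.tokenize('歐巴馬撿到了槍')
tokens = ['[CLS]'] + q_toks + ['[SEP]'] + p_toks + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
segment_ids = torch.tensor([[0] * (len(q_toks) + 2) + [1] * (len(p_toks) + 1)], device=device)
input_mask = torch.ones_like(input_ids)
model.eval()
with torch.no_grad():
    logits = model(input_ids, segment_ids, input_mask)  # shape: (1, 2)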
def data2network(data_struct, data_type, params):
    # input
    sent_words = data_struct['sentences']

    # words
    org_sent_words = sent_words['sent_words']
    sent_words = prep_sentences(sent_words, data_type, params)
    wordsIDs = _elem2idx(sent_words, params['mappings']['word_map'])

    all_sentences = []

    # nner: using subwords
    tokenizer = BertTokenizer.from_pretrained(params['bert_model'], do_lower_case=False)

    for xx, sid in enumerate(data_struct['input']):
        # input
        sentence_data = data_struct['input'][sid]

        # document id
        fid = sid.split(':')[0]

        # words to ids
        word_ids = wordsIDs[xx]
        words = org_sent_words[xx]

        # entity
        readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, \
            subwords, valid_starts, tagsIDs, terms = entity2network(sentence_data, words,
                                                                    params, tokenizer)

        # return
        sentence_vector = OrderedDict()
        sentence_vector['fid'] = fid
        sentence_vector['ents'] = ents
        sentence_vector['word_ids'] = word_ids
        sentence_vector['words'] = words
        sentence_vector['offsets'] = sentence_data['offsets']
        sentence_vector['e_ids'] = idxs
        sentence_vector['tags'] = tagsIDs
        sentence_vector['etypes2'] = etypes2ids
        sentence_vector['toks2'] = toks2
        sentence_vector['raw_words'] = sentence_data['words']
        sentence_vector['entities'] = entities
        sentence_vector['sw_sentence'] = sw_sentence
        sentence_vector['terms'] = terms
        sentence_vector['sub_to_word'] = sub_to_word
        sentence_vector['subwords'] = subwords
        sentence_vector['valid_starts'] = valid_starts

        all_sentences.append(sentence_vector)

    return all_sentences
def torch_data_2_network(cdata2network, params, do_get_nn_data):
    """Convert object-type data to torch.tensor data for use with PyTorch."""
    etypes = [data['etypes2'] for data in cdata2network]

    # nner
    entitiess = [data['entities'] for data in cdata2network]
    sw_sentences = [data['sw_sentence'] for data in cdata2network]
    termss = [data['terms'] for data in cdata2network]
    valid_startss = [data['valid_starts'] for data in cdata2network]
    fids = [data['fid'] for data in cdata2network]
    wordss = [data['words'] for data in cdata2network]
    offsetss = [data['offsets'] for data in cdata2network]
    sub_to_words = [data['sub_to_word'] for data in cdata2network]
    subwords = [data['subwords'] for data in cdata2network]

    tokenizer = BertTokenizer.from_pretrained(params['bert_model'], do_lower_case=False)

    # User-defined data
    if not params["predict"]:
        id_tag_mapping = params["mappings"]["nn_mapping"]["id_tag_mapping"]
        mlb = MultiLabelBinarizer()
        mlb.fit([sorted(id_tag_mapping)[1:]])  # [1:] skip label O
        params["mappings"]["nn_mapping"]["mlb"] = mlb
        params["mappings"]["nn_mapping"]["num_labels"] = len(mlb.classes_)
        params["max_span_width"] = max(params["max_entity_width"], params["max_trigger_width"])
        params["mappings"]["nn_mapping"]["num_triggers"] = len(
            params["mappings"]["nn_mapping"]["trigger_labels"])
        params["mappings"]["nn_mapping"]["num_entities"] = \
            params["mappings"]["nn_mapping"]["num_labels"] - \
            params["mappings"]["nn_mapping"]["num_triggers"]

    if do_get_nn_data:
        nn_data = get_nn_data(fids, entitiess, termss, valid_startss, sw_sentences,
                              tokenizer, params)
        return {
            'nn_data': nn_data,
            'etypes': etypes,
            'fids': fids,
            'words': wordss,
            'offsets': offsetss,
            'sub_to_words': sub_to_words,
            'subwords': subwords,
            'entities': entitiess
        }
def make_dataset(path_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    cvt = BertInputConverter(tokenizer)

    def process_file(path):
        l = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                l.extend(make_examples(json.loads(line.strip()), cvt))
        return l

    examples = []
    for path in path_list:
        examples.extend(process_file(path))
    dataset = Dataset(examples, FIELDS)
    return dataset
def main():
    ## data
    set_name = 'test'  # 'train', 'dev' or 'test'
    raw_data_file = 'PAIRS_FILE'  # contains query passage pairs, format: example_id \t query_text \t passage_text (\t label) \n
    output_features_file = 'FEATURES_FILE'  # format: example_id,input_ids,input_mask,segment_ids,label

    ## prepare tokenizer
    bert_model_dir = 'BERT_MODEL_DIR'  # contains the vocab.txt file
    tokenizer = BertTokenizer.from_pretrained(bert_model_dir, do_lower_case=True)
    max_seq_length = 256

    # start tokenizing
    tokenize_to_features(set_name, raw_data_file, output_features_file, tokenizer,
                         max_seq_length)
    logger.info('Convert to csv done!')
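# tokenize_to_features is not shown here; below is a hedged sketch of the
# conversion the comments above describe (tab-separated pairs in, CSV features
# out). The function name, field serialization, and truncation strategy are
# assumptions for illustration only.
import csv

def tokenize_to_features_sketch(raw_data_file, output_features_file, tokenizer, max_seq_length):
    with open(raw_data_file, encoding='utf-8') as fin, \
         open(output_features_file, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        for line in fin:
            parts = line.rstrip('\n').split('\t')
            example_id, query, passage = parts[0], parts[1], parts[2]
            label = parts[3] if len(parts) > 3 else ''
            q_toks = tokenizer.tokenize(query)
            p_toks = tokenizer.tokenize(passage)
            # Reserve room for [CLS] and two [SEP]s; truncate the passage first.
            p_toks = p_toks[:max_seq_length - len(q_toks) - 3]
            tokens = ['[CLS]'] + q_toks + ['[SEP]'] + p_toks + ['[SEP]']
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            segment_ids = [0] * (len(q_toks) + 2) + [1] * (len(p_toks) + 1)
            input_mask = [1] * len(input_ids)
            pad = max_seq_length - len(input_ids)
            writer.writerow([example_id,
                             ' '.join(map(str, input_ids + [0] * pad)),
                             ' '.join(map(str, input_mask + [0] * pad)),
                             ' '.join(map(str, segment_ids + [0] * pad)),
                             label])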
def get_data_iterators_tacred(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=False)
    label_field = tt.data.LabelField()
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)
    pos_field = tt.data.Field()
    ner_field = tt.data.Field()
    subj_offset_field = tt.data.Field()
    obj_offset_field = tt.data.Field()
    path_format = './.data/TACRED/data/json/%s.json'

    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True,
                                                       cache_dir='bert/cache')
    train_examples, dev_examples, test_examples = (
        get_examples_tacred(path_format % ds, train_lm, bert_tokenizer=bert_tokenizer)
        for ds in ['train', 'dev', 'test'])
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field),
                                             ('length', length_field),
                                             ('offset', offset_field),
                                             ('label', label_field),
                                             ('subj_offset', subj_offset_field),
                                             ('obj_offset', obj_offset_field),
                                             ('ner', ner_field),
                                             ('pos', pos_field)])
                        for ex in [train_examples, dev_examples, test_examples])

    vocab_path = 'vocab/vocab_tacred.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_tacred_bert.pkl'
    if args.fix_test_vocab and not args.use_bert_tokenizer:
        vocab_path = 'vocab/vocab_tacred_fix.pkl'
    c_postfix = '.tacred'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    if args.fix_test_vocab:
        c_postfix += '.fix'
    handle_vocab(vocab_path, text_field, (train, dev, test),
                 args.vector_cache + c_postfix, train_lm, max_size=100000)
    handle_vocab(vocab_path + '.relation', label_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.subj_offset', subj_offset_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.obj_offset', obj_offset_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.pos', pos_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.ner', ner_field, (train, dev, test), '', False, None)

    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size,
                               device=args.gpu if not map_cpu else 'cpu')
        for x in (train, dev, test))
    return (text_field, label_field, subj_offset_field, obj_offset_field, pos_field,
            ner_field, train_iter, dev_iter, test_iter, train, dev)
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file.

    Args:
        pretrained_model_name_or_path: Path to a pretrained model archive
            or one of the pre-trained vocab configs below.
                * bert-base-uncased
                * bert-large-uncased
                * bert-base-cased
                * bert-large-cased
                * bert-base-multilingual-uncased
                * bert-base-multilingual-cased
                * bert-base-chinese
    Keyword args:
        cache_dir: an optional path to a specific directory to download and cache
            the pre-trained model weights.
            Default: None
        do_lower_case: Whether to lower case the input.
            Only has an effect when do_wordpiece_only=False.
            Default: True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            Default: True
        max_len: An artificial maximum length to truncate tokenized sequences to;
            the effective maximum length is always the minimum of this value
            (if specified) and the underlying BERT model's sequence length.
            Default: None
        never_split: List of tokens which will never be split during tokenization.
            Only has an effect when do_wordpiece_only=False.
            Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
def test_2():
    datapath = './data/demo/devset/search.dev.json'
    stg = sample_strategy_factory('trivial_n', k=1)
    examples, labels = load_examples_from_scratch(datapath, stg)
    #examples, labels = load_examples_from_scratch(datapath, attach_label='most_related_para')
    #for (q, p), label in zip(examples[0:20], labels[0:20]):
    #    print(q)
    #    print(p[0:50])
    #    print(label)
    #    print('##' * 10)
    #
    #print(len(examples))
    #print(examples[0:10])
    #print(labels[0:10])
    examples = load_examples_from_scratch(datapath, None)
    #print(len(examples))
    examples = load_examples_from_scratch(datapath, stg, concat=True)

    BERT_SERIALIZATION_DIR = './pretrained/chinese_wwm_ext_pytorch'
    tokenizer = BertTokenizer('%s/vocab.txt' % (BERT_SERIALIZATION_DIR))
    device = torch.device('cpu')
    num_fn = functools.partial(generate_bert_pointwise_input, max_seq_len=200,
                               max_passage_len=100, tokenizer=tokenizer, device=device)

    fake_examples = [('你好嗎', '歐巴馬撿到了300快'),
                     ('我不好啦', '歐巴馬撿到了槍 高雄發大財了'),
                     ('哈哈哈', '猜猜我是誰')]
    X = generate_bert_pointwise_input(fake_examples, 20, 7, tokenizer, device)
    for a, b, c in X:
        print('%d' % (a.shape))
        print('- - - ' * 18)
    #print(X)
    #print(examples[0:2])

    bt = BatchIter(examples, 16, num_fn)
    for batch, y in bt:
        print(batch[0][0].shape)
        print(batch[1][1].shape)
        print(y.shape)
def __init__(self, sample_list, bert_path, max_passage_len, max_seq_length, device=None):
    super(BertRankDataset, self).__init__(sample_list, device)
    self.add_bert_fields()
    self.tokenizer = BertTokenizer('%s/vocab.txt' % (bert_path))
    self.max_seq_length = max_seq_length
    self.max_passage_len = max_passage_len
    #_num_fn = numeralize_fucntion_factory(config.NUM_FN_NAME)
    self.numeralize_fn = functools.partial(generate_bert_pointwise_input,
                                           max_seq_len=self.max_seq_length,
                                           max_passage_len=self.max_passage_len,
                                           tokenizer=self.tokenizer,
                                           device=self.device,
                                           wrap_tensor_flag=False)
    examples = [(sample['question'], sample['passage']) for sample in self.sample_list]
    bert_input_t, seg_ids_t, input_mask_t = self.numeralize_fn(examples)
    for i, sample in enumerate(self.sample_list):
        sample.update({
            'input_ids': bert_input_t[i],
            'input_mask': input_mask_t[i],
            'segment_ids': seg_ids_t[i]
        })
def __init__(self, config, decoder_dict=None, eval_flag=True, device=None):
    self.config = config
    if device is None:
        self.device = get_default_device()
    else:
        self.device = device
    bert_config_path = '%s/bert_config.json' % (config.BERT_SERIALIZATION_DIR)
    self.model = load_bert_rc_model(bert_config_path, config.MODEL_PATH, self.device)
    self.model.load_state_dict(torch.load(config.MODEL_PATH, map_location=self.device))
    self.model = self.model.to(self.device)
    if eval_flag:
        self.model.eval()
    # bert-base-chinese
    self.tokenizer = BertTokenizer('%s/vocab.txt' % (config.BERT_SERIALIZATION_DIR),
                                   do_lower_case=True)
    if decoder_dict is None:
        self.decoder = MrcDecoderFactory.from_dict({'class': 'default', 'kwargs': {}})
    else:
        self.decoder = MrcDecoderFactory.from_dict(decoder_dict)
# for y in type2son[x]:
#     t[type2id[y]] = alpha
# prior_numpy[:, type2id[x]] = t
# return prior_numpy
#
#prior = torch.from_numpy(create_prior())
#tune = torch.from_numpy(np.transpose(create_prior(args.hierarchy_alpha)))

logger.info('load bert and ernie tokenizer')
ernie_tokenizer_label = ErnieTokenizer_label.from_pretrained(
    'ernie_base/', do_lower_case=args.bert_low_case)
ernie_tokenizer = ErnieTokenizer.from_pretrained(
    'ernie_base/', do_lower_case=args.bert_low_case)
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert_large/', do_lower_case=args.bert_low_case)


# dataset for open type:
# left context tokens + mention_span + right context tokens
class OpenDataset(data.Dataset):
    def __init__(self, path):
        entries = open(path, 'r').read().strip().splitlines()
        self.left_context, self.right_context, self.mention_span, self.labels = [], [], [], []

        def trans(x):
            return x[x.rfind('/') + 1:]

        for entry in entries:
            entry = dict(eval(entry))
            ys = entry['y_str']
def main(args): if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_eval` or `do_predict` must be True." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) logger.warn('Output directory {} already exists.'.format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() tokenizer = BertTokenizer(args.vocab, do_lower_case=args.do_lower_case) dprd_task = DPRDTask(tokenizer) eval_data = dprd_task.get_dev_dataset(args.data_dir, 128, input_type=args.tasks) if args.wnli_data: wnli_task = WNLITask(tokenizer) wnli_data = wnli_task.get_dev_dataset(args.wnli_data, 128, input_type=args.tasks) eval_data += wnli_data if args.wsc273_data: wsc273_task = WSC273Task(tokenizer) wsc273_data = wsc273_task.get_dev_dataset(args.wsc273_data, 128, input_type=args.tasks) eval_data += wsc273_data if args.gap_data: gap_task = GAPTask(tokenizer) gap_data = gap_task.get_dev_dataset(args.gap_data, 384, input_type=args.tasks) eval_data += gap_data logger.info(" Evaluation batch size = %d", args.eval_batch_size) train_examples = None num_train_steps = None if args.do_train: train_data = build_training_data_mt(args, tokenizer) total_examples = len(train_data) num_train_steps = int(total_examples / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) logger.info(" Training batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model = create_model(args, 2, device) if args.do_train: train_model(args, device, n_gpu, model, train_data, eval_data, num_train_steps) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): run_eval(args, model.eval(), device, eval_data, prefix=args.tag) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_data = dprd_task.get_test_dataset(args.data_dir, 128, input_type=args.tasks) if args.wnli_data: wnli_data = wnli_task.get_test_dataset(args.wnli_data, 128, input_type=args.tasks) test_data += wnli_data if args.wsc273_data: wsc273_data = wsc273_task.get_test_dataset(args.wsc273_data, 128, input_type=args.tasks) test_data += wsc273_data logger.info(" Prediction batch size = %d", args.predict_batch_size) run_predict(args, model, device, test_data, prefix=args.tag)
import json
import pickle

from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list',
                                              do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token,
                    padding_token='[PAD]',
                    unknown_token='[UNK]',
                    bos_token=None,
                    eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
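# Hedged usage sketch: reload the pickled vocab built above and map BERT
# subtokens to indices. The lookup attribute (token_to_idx) is an assumption
# about model.utils.Vocab; adjust to its actual interface.
with open(token_vocab_path, 'rb') as f:
    loaded_vocab = pickle.load(f)

subtokens = ptr_tokenizer.tokenize('안녕하세요')
indices = [loaded_vocab.token_to_idx[tok] if tok in loaded_vocab.token_to_idx
           else loaded_vocab.token_to_idx['[UNK]'] for tok in subtokens]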
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, help="The output directory where the model checkpoints will be written." ) parser.add_argument("--train_file", default=None, type=str) parser.add_argument("--val_file", default=None, type=str) parser.add_argument("--test_file", default=None, type=str) parser.add_argument("--test_output", default=None, type=str) parser.add_argument("--label_vocab", default=None, type=str, required=True) parser.add_argument("--punc_set", default='PU', type=str) parser.add_argument("--has_confidence", action='store_true') parser.add_argument("--only_save_bert", action='store_true') parser.add_argument("--arc_space", default=512, type=int) parser.add_argument("--type_space", default=128, type=int) parser.add_argument("--log_file", default=None, type=str) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action='store_true', help="Whether to run predict on the test set.") parser.add_argument("--do_greedy_predict", action='store_true', help="Whether to run predict on the test set.") parser.add_argument("--do_ensemble_predict", action='store_true', help="Whether to run predict on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--test_batch_size", default=8, type=int, help="Total batch size for test.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.log_file is None: logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) else: logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', filename=args.log_file, filemode='w', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict and not args.do_greedy_predict and not args.do_ensemble_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: assert args.output_dir is not None if args.do_train and os.path.exists(args.output_dir) and os.listdir( args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if args.do_train and not os.path.exists(args.output_dir): os.makedirs(args.output_dir) label_vocab, label_vocab2idx = load_label_vocab(args.label_vocab) punc_set = set( args.punc_set.split(',')) if args.punc_set is not None else None train_examples = None num_train_optimization_steps = None if args.do_train: assert args.train_file is not None train_examples = read_conll_examples( args.train_file, is_training=True, has_confidence=args.has_confidence) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.do_train or args.do_predict or args.do_greedy_predict: # load the pretrained model tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) model = BertForDependencyParsing.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), arc_space=args.arc_space, type_space=args.type_space, num_labels=len(label_vocab)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # parser = model.module if hasattr(model, 'module') else model elif args.do_ensemble_predict: bert_models = args.bert_model.split(',') assert len(bert_models) > 1 tokenizer = BertTokenizer.from_pretrained( bert_models[0], do_lower_case=args.do_lower_case) models = [] for bm in bert_models: model = BertForDependencyParsing.from_pretrained( bm, cache_dir=os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), arc_space=args.arc_space, type_space=args.type_space, num_labels=len(label_vocab)) model.to(device) model.eval() models.append(model) parser = models[0].module if hasattr(models[0], 'module') else models[0] # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex # !!! NOTE why? param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # start training loop if args.do_train: global_step = 0 train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, label_vocab2idx, True, has_confidence=args.has_confidence) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.float32) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_lengths = torch.tensor([f.seq_len for f in train_features], dtype=torch.long) all_heads = torch.tensor([f.heads for f in train_features], dtype=torch.long) all_labels = torch.tensor([f.labels for f in train_features], dtype=torch.long) if args.has_confidence: all_confidence = torch.tensor( [f.confidence for f in train_features], dtype=torch.float32) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_heads, all_labels, all_confidence) else: train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_heads, all_labels) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, 
batch_size=args.train_batch_size) if args.do_eval: assert args.val_file is not None eval_examples = read_conll_examples(args.val_file, is_training=False, has_confidence=False) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, label_vocab2idx, False, has_confidence=False) logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_example_ids = torch.tensor( [f.example_id for f in eval_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.float32) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_lengths = torch.tensor([f.seq_len for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_example_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) best_uas = 0 best_las = 0 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): logger.info("Training epoch: {}".format(epoch)) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 model.train() for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) if args.has_confidence: input_ids, input_mask, segment_ids, lengths, heads, label_ids, confidence = batch else: confidence = None input_ids, input_mask, segment_ids, lengths, heads, label_ids = batch loss = model(input_ids, segment_ids, input_mask, heads, label_ids, confidence) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % 100 == 0: logger.info("Training loss: {}, global step: {}".format( tr_loss / nb_tr_steps, global_step)) # we eval every epoch if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("***** Running evaluation *****") model.eval() eval_predict_words, eval_predict_postags, eval_predict_heads, eval_predict_labels = [],[],[],[] for input_ids, input_mask, segment_ids, lengths, example_ids in tqdm( eval_dataloader, desc="Evaluating"): example_ids = example_ids.numpy() batch_words = [ eval_features[eid].example.sentence for eid in example_ids ] batch_postags = [ eval_features[eid].example.postags for eid in example_ids ] batch_word_index = [ eval_features[eid].word_index for eid in example_ids ] # token -> word batch_token_starts = [ eval_features[eid].token_starts for eid in example_ids ] # word -> token start batch_heads = [ eval_features[eid].example.heads for eid in example_ids ] input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) heads = heads.to(device) label_ids = label_ids.to(device) with torch.no_grad(): # tmp_eval_loss = model(input_ids, segment_ids, input_mask, heads, label_ids) energy = model(input_ids, segment_ids, input_mask) heads_pred, labels_pred = parser.decode_MST( energy.cpu().numpy(), lengths.numpy(), leading_symbolic=0, labeled=True) # we convert the subword dependency parsing to word dependency parsing just the word and token start map pred_heads = [] pred_labels = [] for i in range(len(batch_word_index)): word_index = batch_word_index[i] token_starts = batch_token_starts[i] hpd = [] lpd = [] for j in range(len(token_starts)): if j == 0: #[CLS] continue elif j == len(token_starts) - 1: # [SEP] continue else: hpd.append( word_index[heads_pred[i, token_starts[j]]]) lpd.append( label_vocab[labels_pred[i, token_starts[j]]]) pred_heads.append(hpd) pred_labels.append(lpd) eval_predict_words += batch_words eval_predict_postags += batch_postags eval_predict_heads += pred_heads eval_predict_labels += pred_labels eval_output_file = os.path.join(args.output_dir, 'eval.pred') write_conll_examples(eval_predict_words, eval_predict_postags, eval_predict_heads, eval_predict_labels, eval_output_file) eval_f = os.popen( "python scripts/eval_nlpcc_dp.py " + args.val_file + " " + eval_output_file, "r") result_text = eval_f.read().strip() logger.info("***** Eval results *****") logger.info(result_text) eval_f.close() eval_res = re.findall( r'UAS = \d+/\d+ = ([\d\.]+), LAS = \d+/\d+ = ([\d\.]+)', result_text) assert len(eval_res) > 0 eval_res = eval_res[0] eval_uas = float(eval_res[0]) eval_las = float(eval_res[1]) # save model if best_las < eval_las or (eval_las == 
best_las and best_uas < eval_uas): best_uas = eval_uas best_las = eval_las logger.info( "new best uas %.2f%% las %.2f%%, saving models.", best_uas, best_las) # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) model_dict = model_to_save.state_dict() if args.only_save_bert: model_dict = { k: v for k, v in model_dict.items() if 'bert.' in k } torch.save(model_dict, output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # start predict if args.do_predict: model.eval() assert args.test_file is not None test_examples = read_conll_examples(args.test_file, is_training=False, has_confidence=False) test_features = convert_examples_to_features(test_examples, tokenizer, args.max_seq_length, label_vocab2idx, False, has_confidence=False) logger.info("***** Running prediction *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.test_batch_size) all_example_ids = torch.tensor([f.example_id for f in test_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.float32) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_lengths = torch.tensor([f.seq_len for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_example_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.test_batch_size) test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [],[],[],[] for batch_id, batch in enumerate( tqdm(test_dataloader, desc="Predicting")): input_ids, input_mask, segment_ids, lengths, example_ids = batch example_ids = example_ids.numpy() batch_words = [ test_features[eid].example.sentence for eid in example_ids ] batch_postags = [ test_features[eid].example.postags for eid in example_ids ] batch_word_index = [ test_features[eid].word_index for eid in example_ids ] # token -> word batch_token_starts = [ test_features[eid].token_starts for eid in example_ids ] # word -> token start input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) lengths = lengths.numpy() with torch.no_grad(): energy = model(input_ids, segment_ids, input_mask) heads_pred, labels_pred = parser.decode_MST(energy.cpu().numpy(), lengths, leading_symbolic=0, labeled=True) pred_heads = [] pred_labels = [] for i in range(len(batch_word_index)): word_index = batch_word_index[i] token_starts = batch_token_starts[i] hpd = [] lpd = [] for j in range(len(token_starts)): if j == 0: #[CLS] continue elif j == len(token_starts) - 1: # [SEP] continue else: hpd.append(word_index[heads_pred[i, token_starts[j]]]) lpd.append(label_vocab[labels_pred[i, token_starts[j]]]) pred_heads.append(hpd) pred_labels.append(lpd) test_predict_words += batch_words test_predict_postags += batch_postags test_predict_heads += pred_heads test_predict_labels += pred_labels assert args.test_output is not None write_conll_examples(test_predict_words, 
test_predict_postags, test_predict_heads, test_predict_labels, args.test_output) if args.do_greedy_predict: model.eval() assert args.test_file is not None test_examples = read_conll_examples(args.test_file, is_training=False, has_confidence=False) test_features = convert_examples_to_features(test_examples, tokenizer, args.max_seq_length, label_vocab2idx, False, has_confidence=False) logger.info("***** Running prediction *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.test_batch_size) all_example_ids = torch.tensor([f.example_id for f in test_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.float32) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_lengths = torch.tensor([f.seq_len for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_example_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.test_batch_size) test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [],[],[],[] for batch_id, batch in enumerate( tqdm(test_dataloader, desc="Predicting")): input_ids, input_mask, segment_ids, lengths, example_ids = batch example_ids = example_ids.numpy() batch_words = [ test_features[eid].example.sentence for eid in example_ids ] batch_postags = [ test_features[eid].example.postags for eid in example_ids ] batch_word_index = [ test_features[eid].word_index for eid in example_ids ] # token -> word batch_token_starts = [ test_features[eid].token_starts for eid in example_ids ] # word -> token start input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) lengths = lengths.numpy() with torch.no_grad(): heads_pred, labels_pred = model(input_ids, segment_ids, input_mask, greedy_inference=True) pred_heads = [] pred_labels = [] for i in range(len(batch_word_index)): word_index = batch_word_index[i] token_starts = batch_token_starts[i] hpd = [] lpd = [] for j in range(len(token_starts)): if j == 0: #[CLS] continue elif j == len(token_starts) - 1: # [SEP] continue else: hpd.append(word_index[heads_pred[i, token_starts[j]]]) lpd.append(label_vocab[labels_pred[i, token_starts[j]]]) pred_heads.append(hpd) pred_labels.append(lpd) test_predict_words += batch_words test_predict_postags += batch_postags test_predict_heads += pred_heads test_predict_labels += pred_labels assert args.test_output is not None write_conll_examples(test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels, args.test_output) if args.do_ensemble_predict: assert args.test_file is not None test_examples = read_conll_examples(args.test_file, is_training=False, has_confidence=False) test_features = convert_examples_to_features(test_examples, tokenizer, args.max_seq_length, label_vocab2idx, False, has_confidence=False) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.test_batch_size) all_example_ids = torch.tensor([f.example_id for f in test_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], 
dtype=torch.float32) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_lengths = torch.tensor([f.seq_len for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lengths, all_example_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.test_batch_size) test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [],[],[],[] for batch_id, batch in enumerate( tqdm(test_dataloader, desc="Predicting")): input_ids, input_mask, segment_ids, lengths, example_ids = batch example_ids = example_ids.numpy() batch_words = [ test_features[eid].example.sentence for eid in example_ids ] batch_postags = [ test_features[eid].example.postags for eid in example_ids ] batch_word_index = [ test_features[eid].word_index for eid in example_ids ] # token -> word batch_token_starts = [ test_features[eid].token_starts for eid in example_ids ] # word -> token start input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) lengths = lengths.numpy() with torch.no_grad(): energy_sum = None for model in models: energy = model(input_ids, segment_ids, input_mask) if energy_sum is None: energy_sum = energy else: energy_sum = energy_sum + energy energy_sum = energy_sum / len(models) heads_pred, labels_pred = parser.decode_MST( energy_sum.cpu().numpy(), lengths, leading_symbolic=0, labeled=True) pred_heads = [] pred_labels = [] for i in range(len(batch_word_index)): word_index = batch_word_index[i] token_starts = batch_token_starts[i] hpd = [] lpd = [] for j in range(len(token_starts)): if j == 0: #[CLS] continue elif j == len(token_starts) - 1: # [SEP] continue else: hpd.append(word_index[heads_pred[i, token_starts[j]]]) lpd.append(label_vocab[labels_pred[i, token_starts[j]]]) pred_heads.append(hpd) pred_labels.append(lpd) test_predict_words += batch_words test_predict_postags += batch_postags test_predict_heads += pred_heads test_predict_labels += pred_labels assert args.test_output is not None write_conll_examples(test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels, args.test_output)
                   extra_features=extra_features)
    for sidx, (train_data, dev_data) in enumerate(mediqa_split_data):
        mediqa_train_fout = os.path.join(mt_dnn_root, 'mediqa_{}_train.json'.format(sidx))
        mediqa_dev_fout = os.path.join(mt_dnn_root, 'mediqa_{}_dev.json'.format(sidx))
        build_data(train_data, mediqa_train_fout, extra_features=extra_features)
        build_data(dev_data, mediqa_dev_fout, extra_features=extra_features)
    logger.info('done with mediqa')

    medquad_train_fout = os.path.join(mt_dnn_root, 'medquad_train.json')
    medquad_dev_fout = os.path.join(mt_dnn_root, 'medquad_dev.json')
    build_data(medquad_train_data, medquad_train_fout)
    build_data(medquad_dev_data, medquad_dev_fout)
    logger.info('done with medquad')


if __name__ == '__main__':
    args = parse_args()
    if args.sci_vocab:
        # default to uncased
        bert_tokenizer = BertTokenizer.from_pretrained(
            '../bert_models/scibert_scivocab_uncased/vocab.txt')
    elif args.cased:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    else:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    main(args)
def main(*_, **kwargs): use_cuda = torch.cuda.is_available() and kwargs["device"] >= 0 device = torch.device("cuda:" + str(kwargs["device"]) if use_cuda else "cpu") if use_cuda: torch.cuda.set_device(device) kwargs["use_cuda"] = use_cuda neptune.create_experiment( name="bert-span-parser", upload_source_files=[], params={ k: str(v) if isinstance(v, bool) else v for k, v in kwargs.items() }, ) logger.info("Settings: {}", json.dumps(kwargs, indent=2, ensure_ascii=False)) # For reproducibility os.environ["PYTHONHASHSEED"] = str(kwargs["seed"]) random.seed(kwargs["seed"]) np.random.seed(kwargs["seed"]) torch.manual_seed(kwargs["seed"]) torch.cuda.manual_seed_all(kwargs["seed"]) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # Prepare and load data tokenizer = BertTokenizer.from_pretrained(kwargs["bert_model"], do_lower_case=False) logger.info("Loading data...") train_treebank = load_trees(kwargs["train_file"]) dev_treebank = load_trees(kwargs["dev_file"]) test_treebank = load_trees(kwargs["test_file"]) logger.info( "Loaded {:,} train, {:,} dev, and {:,} test examples!", len(train_treebank), len(dev_treebank), len(test_treebank), ) logger.info("Preprocessing data...") train_parse = [tree.convert() for tree in train_treebank] train_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in train_parse] dev_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in dev_treebank] test_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in test_treebank] logger.info("Data preprocessed!") logger.info("Preparing data for training...") tags = [] labels = [] for tree in train_parse: nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, InternalParseNode): labels.append(node.label) nodes.extend(reversed(node.children)) else: tags.append(node.tag) tag_encoder = LabelEncoder() tag_encoder.fit(tags, reserved_labels=["[PAD]", "[UNK]"]) label_encoder = LabelEncoder() label_encoder.fit(labels, reserved_labels=[()]) logger.info("Data prepared!") # Settings num_train_optimization_steps = kwargs["num_epochs"] * ( (len(train_parse) - 1) // kwargs["batch_size"] + 1) kwargs["batch_size"] //= kwargs["gradient_accumulation_steps"] logger.info("Creating dataloaders for training...") train_dataloader, train_features = create_dataloader( sentences=train_sentences, batch_size=kwargs["batch_size"], tag_encoder=tag_encoder, tokenizer=tokenizer, is_eval=False, ) dev_dataloader, dev_features = create_dataloader( sentences=dev_sentences, batch_size=kwargs["batch_size"], tag_encoder=tag_encoder, tokenizer=tokenizer, is_eval=True, ) test_dataloader, test_features = create_dataloader( sentences=test_sentences, batch_size=kwargs["batch_size"], tag_encoder=tag_encoder, tokenizer=tokenizer, is_eval=True, ) logger.info("Dataloaders created!") # Initialize model model = ChartParser.from_pretrained( kwargs["bert_model"], tag_encoder=tag_encoder, label_encoder=label_encoder, lstm_layers=kwargs["lstm_layers"], lstm_dim=kwargs["lstm_dim"], tag_embedding_dim=kwargs["tag_embedding_dim"], label_hidden_dim=kwargs["label_hidden_dim"], dropout_prob=kwargs["dropout_prob"], ) model.to(device) # Prepare optimizer param_optimizers = list(model.named_parameters()) if kwargs["freeze_bert"]: for p in model.bert.parameters(): p.requires_grad = False param_optimizers = [(n, p) for n, p in param_optimizers if p.requires_grad] # Hack to remove pooler, which is not used thus it produce None grad that break apex param_optimizers = [n for n in param_optimizers 
if "pooler" not in n[0]] no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizers if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizers if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = BertAdam( optimizer_grouped_parameters, lr=kwargs["learning_rate"], warmup=kwargs["warmup_proportion"], t_total=num_train_optimization_steps, ) if kwargs["fp16"]: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") pretrained_model_file = os.path.join(kwargs["output_dir"], MODEL_FILENAME) if kwargs["do_eval"]: assert os.path.isfile( pretrained_model_file), "Pretrained model file does not exist!" logger.info("Loading pretrained model from {}", pretrained_model_file) # Load model from file params = torch.load(pretrained_model_file, map_location=device) model.load_state_dict(params["model"]) logger.info( "Loaded pretrained model (Epoch: {:,}, Fscore: {:.2f})", params["epoch"], params["fscore"], ) eval_score = eval( model=model, eval_dataloader=test_dataloader, eval_features=test_features, eval_trees=test_treebank, eval_sentences=test_sentences, tag_encoder=tag_encoder, device=device, ) neptune.send_metric("test_eval_precision", eval_score.precision()) neptune.send_metric("test_eval_recall", eval_score.recall()) neptune.send_metric("test_eval_fscore", eval_score.fscore()) tqdm.write("Evaluation score: {}".format(str(eval_score))) else: # Training phase global_steps = 0 start_epoch = 0 best_dev_fscore = 0 if kwargs["preload"] or kwargs["resume"]: assert os.path.isfile( pretrained_model_file), "Pretrained model file does not exist!" logger.info("Resuming model from {}", pretrained_model_file) # Load model from file params = torch.load(pretrained_model_file, map_location=device) model.load_state_dict(params["model"]) if kwargs["resume"]: optimizer.load_state_dict(params["optimizer"]) torch.cuda.set_rng_state_all([ state.cpu() for state in params["torch_cuda_random_state_all"] ]) torch.set_rng_state(params["torch_random_state"].cpu()) np.random.set_state(params["np_random_state"]) random.setstate(params["random_state"]) global_steps = params["global_steps"] start_epoch = params["epoch"] + 1 best_dev_fscore = params["fscore"] else: assert not os.path.isfile( pretrained_model_file ), "Please remove or move the pretrained model file to another place!" 
for epoch in trange(start_epoch, kwargs["num_epochs"], desc="Epoch"): model.train() train_loss = 0 num_train_steps = 0 for step, (indices, *_) in enumerate( tqdm(train_dataloader, desc="Iteration")): ids, attention_masks, tags, sections, trees, sentences = prepare_batch_input( indices=indices, features=train_features, trees=train_parse, sentences=train_sentences, tag_encoder=tag_encoder, device=device, ) loss = model( ids=ids, attention_masks=attention_masks, tags=tags, sections=sections, sentences=sentences, gold_trees=trees, ) if kwargs["gradient_accumulation_steps"] > 1: loss /= kwargs["gradient_accumulation_steps"] if kwargs["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() num_train_steps += 1 if (step + 1) % kwargs["gradient_accumulation_steps"] == 0: optimizer.step() optimizer.zero_grad() global_steps += 1 # Write logs neptune.send_metric("train_loss", epoch, train_loss / num_train_steps) neptune.send_metric("global_steps", epoch, global_steps) tqdm.write( "Epoch: {:,} - Train loss: {:.4f} - Global steps: {:,}".format( epoch, train_loss / num_train_steps, global_steps)) # Evaluate eval_score = eval( model=model, eval_dataloader=dev_dataloader, eval_features=dev_features, eval_trees=dev_treebank, eval_sentences=dev_sentences, tag_encoder=tag_encoder, device=device, ) neptune.send_metric("eval_precision", epoch, eval_score.precision()) neptune.send_metric("eval_recall", epoch, eval_score.recall()) neptune.send_metric("eval_fscore", epoch, eval_score.fscore()) tqdm.write("Epoch: {:,} - Evaluation score: {}".format( epoch, str(eval_score))) # Save best model if eval_score.fscore() > best_dev_fscore: best_dev_fscore = eval_score.fscore() tqdm.write("** Saving model...") os.makedirs(kwargs["output_dir"], exist_ok=True) torch.save( { "epoch": epoch, "global_steps": global_steps, "fscore": best_dev_fscore, "random_state": random.getstate(), "np_random_state": np.random.get_state(), "torch_random_state": torch.get_rng_state(), "torch_cuda_random_state_all": torch.cuda.get_rng_state_all(), "optimizer": optimizer.state_dict(), "model": (model.module if hasattr(model, "module") else model).state_dict(), }, pretrained_model_file, ) tqdm.write( "** Best evaluation fscore: {:.2f}".format(best_dev_fscore))
seed = args.seed random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpu > 1: torch.cuda.manual_seed_all(args.seed) ## Import proper model if args.is_sequence_labeling: from model import BertForSequenceLabeling as Model else: from model import BertForSequenceClassification as Model logging.info('Building tokenizer...') tokenizer = BertTokenizer(args.pretrained_weights_dir+'vocab.txt') logging.info('Loading data...') data = Data(args.task_name, args.data_dir, tokenizer, args.max_seq_len, args.is_sequence_labeling) logging.info('Building Model...') model = Model.from_pretrained(args.pretrained_weights_dir, data.label_size) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [
def online_test_coref(config, input_text):
    """
    Run coreference resolution on a single piece of input text.
    :param config: configuration parameters
    :param input_text: raw text to resolve
    :return: None
    """
    def create_example(text):
        """Convert raw text into the example format expected by the model."""
        sentences = [['[CLS]'] + tokenizer.tokenize_not_UNK(text) + ['[SEP]']]
        sentence_map = [0] * len(sentences[0])
        speakers = [["-" for _ in sentence] for sentence in sentences]
        subtoken_map = [i for i in range(len(sentences[0]))]
        return {
            "doc_key": "bn",
            "clusters": [],
            "sentences": sentences,
            "speakers": speakers,
            'sentence_map': sentence_map,
            'subtoken_map': subtoken_map
        }

    tokenizer = BertTokenizer.from_pretrained(config['vocab_file'],
                                              do_lower_case=True)
    online_coref_output_file = config['online_output_path']
    example = create_example(input_text)
    model = CorefModel.from_pretrained(config["model_save_path"],
                                       coref_task_config=config)
    model.to(device)
    model.eval()
    with open(online_coref_output_file, 'w', encoding="utf-8") as output_file:
        with torch.no_grad():
            tensorized_example = model.tensorize_example(example,
                                                         is_training=False)
            input_ids = torch.from_numpy(
                tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(
                tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(
                tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(
                tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(
                tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(
                tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(
                tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(
                tensorized_example[9]).long().to(device)

            (_, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores), _ = \
                model(input_ids, input_mask, text_len, speaker_ids, genre, is_training,
                      gold_starts, gold_ends, cluster_ids, sentence_map)

            predicted_antecedents = model.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())

            # predicted entity indices
            example["predicted_clusters"], _ = model.get_predicted_clusters(
                top_span_starts, top_span_ends, predicted_antecedents)

            # indices -> text
            example_sentence = utils.flatten(example["sentences"])
            predicted_list = []
            for same_entity in example["predicted_clusters"]:
                same_entity_list = []
                num_same_entity = len(same_entity)
                for index in range(num_same_entity):
                    entity_name = ''.join(example_sentence[
                        same_entity[index][0]:same_entity[index][1] + 1])
                    same_entity_list.append(entity_name)
                predicted_list.append(same_entity_list)
                same_entity_list = []  # reset the list

            example["predicted_idx2entity"] = predicted_list
            example["top_spans"] = list(
                zip((int(i) for i in top_span_starts),
                    (int(i) for i in top_span_ends)))
            example['head_scores'] = []

            output_file.write(json.dumps(example, ensure_ascii=False))
            output_file.write("\n")
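# A hypothetical invocation of online_test_coref. The config keys shown are the
# ones read above (vocab_file, online_output_path, model_save_path); a real config
# would also need the fields CorefModel itself expects. The paths and the input
# sentence are placeholders, not values from the original project.
if __name__ == "__main__":
    demo_config = {
        "vocab_file": "./pretrained/chinese_bert/vocab.txt",
        "online_output_path": "./output/online_coref.jsonl",
        "model_save_path": "./output/coref_model",
    }
    # "Xiao Ming went to Beijing; he stayed there for three days."
    online_test_coref(demo_config, "小明去了北京,他在那里待了三天。")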
from utils.args import get_args import torch import numpy as np from utils.reader import load_vocab from bert.tokenization import BertTokenizer from utils.parser import get_span_to_node_mapping, parse_tree import csv, pickle from collections import defaultdict from utils.args import get_best_snapshot from nns.linear_model import BOWRegression, BOWRegressionMulti import argparse args = get_args() tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir='bert/cache') def unigram_linear_pearson(filename): f = open(filename) model = torch.load(args.bow_snapshot, map_location='cpu') vocab = load_vocab(VOCAB) out, truth = [], [] coeff_dict = {} scores_dict = defaultdict(list) valid, total = 0, 0 for lidx, line in enumerate(f.readlines()): if lidx < MINLINE: continue if lidx == MAXLINE: break l = line.lower().strip().split('\t') for entry in l: items = entry.strip().split(' ') if len(items) > 2: continue
self.dev_data = DataProcessor(path + 'dev.tsv', loader.load_dev_data, loader.label_map, tokenizer, max_seq_len, is_sequence_labeling) logging.info('Demo test data') self.test_data = DataProcessor(path + 'test.tsv', loader.load_test_data, loader.label_map, tokenizer, max_seq_len, is_sequence_labeling) self.label_size = len(loader.label_map) self.label_map = loader.label_map self.reverse_label_map = loader.reverse_label_map if __name__ == '__main__': logging.info('Building tokenizer...') tokenizer = BertTokenizer('pretrained_weights/vocab.txt') logging.info('Loading data...') path = './data/CoLA/' data = Data('cola', path, tokenizer) logging.info('Loading data...') path = './data/MRPC/' data = Data('mrpc', path, tokenizer) logging.info('Loading data...') path = './data/NER/' data = Data('sequencelabeling', path, tokenizer, is_sequence_labeling=True)
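# A small follow-up to the demo above, touching only attributes the fragment
# assigns (label_size, label_map, reverse_label_map). The inverse-map check is an
# assumption based on the attribute names, not something the fragment guarantees:
logging.info('NER label size: %d', data.label_size)
for label, idx in data.label_map.items():
    assert data.reverse_label_map[idx] == label  # assuming the two maps are inverses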
'att_mask': torch.LongTensor(input_mask) } return { 'question': question, 'passage': passage, 'input': torch.LongTensor(input_ids).unsqueeze(0), 'seg': torch.LongTensor(segment_ids).unsqueeze(0), 'att_mask': torch.LongTensor(input_mask).unsqueeze(0) } if __name__ == "__main__": print('load model') model = load_bert_model() print('convert') tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True) cvt = BertInputConverter(tokenizer) examples = read_from_demo_txt_file() for question, passage in examples: sample = cvt.convert(question, passage, args.max_query_length, args.max_seq_length) print('Question') print(sample['question']) print('Passage') print(sample['passage']) answer = predict_one_sample(model, sample) print('Answer') print(answer)
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--log_every', type=int, default=100, help="Log every X batch") parser.add_argument("--mlm_only", action='store_true', help="Only use MLM objective") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" if args.output_dir.is_dir() and list(args.output_dir.iterdir()): print( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) logger = util.get_logger(f'{args.output_dir}/exp.txt') for key, value in vars(args).items(): logger.info('command line argument: %s - %r', key, value) samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." 
) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if args.mlm_only: model = BertForMaskedLM.from_pretrained(args.bert_model) else: model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) if args.mlm_only: param_optimizer = [ x for x in param_optimizer if 'bert.pooler' not in x[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(f" Num examples = {total_train_examples}") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( logger=logger, epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, mlm_only=args.mlm_only) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 losses = [] with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) if args.mlm_only: input_ids, input_mask, segment_ids, lm_label_ids = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids) else: input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") losses.append(loss.item()) if step % args.log_every == 0: logger.info( f"loss at ep {epoch} batch {step}/{len(train_dataloader)} is {np.mean(losses):.5f}" ) losses = [] if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = args.output_dir / f"epoch{epoch}_pytorch_model.bin" torch.save(model_to_save.state_dict(), str(output_model_file))
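# warmup_linear is called in the fp16 branch above but not defined in this file.
# Older pytorch-pretrained-bert releases shipped a schedule along these lines (a
# sketch of that convention, not necessarily the exact function this script
# imports); x is the fraction of total optimization steps completed.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup   # linear warmup from 0 up to the peak learning rate
    return 1.0 - x          # then linear decay back toward 0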
def fit_tfidf_model(dataset): if dataset == 'gab': data_processor = GabProcessor(configs) else: # dataset is 'ws' configs.data_dir = './data/white_supremacy/' data_processor = WSProcessor(configs) model = LogisticRegression() tokenizer = BertTokenizer.from_pretrained( configs.bert_model, do_lower_case=configs.do_lower_case) train_examples, val_examples = data_processor.get_train_examples(configs.data_dir), \ data_processor.get_dev_examples(configs.data_dir) random.shuffle(train_examples) gab_processor = GabProcessor(configs) gab_test_examples = gab_processor.get_test_examples( './data/majority_gab_dataset_25k/') _, train_labels, train_tokens = examples_to_bow(train_examples, tokenizer, configs.max_seq_length) _, val_labels, val_tokens = examples_to_bow(val_examples, tokenizer, configs.max_seq_length) _, test_labels, test_tokens = examples_to_bow(gab_test_examples, tokenizer, configs.max_seq_length) train_docs, val_docs = [' '.join(x) for x in train_tokens ], [' '.join(x) for x in val_tokens] # binary BOW vector performs better than tfidf #vectorizer = TfidfVectorizer(tokenizer=str.split) vectorizer = CountVectorizer(binary=True) X = vectorizer.fit_transform(train_docs) neg_weight = 0.125 if dataset == 'ws' else 0.1 weights = [1 if x == 1 else neg_weight for x in train_labels] model.fit(X, train_labels, weights) X_val = vectorizer.transform(val_docs) pred_gab_val = model.predict(X_val) f1 = f1_score(val_labels, pred_gab_val) print('val f1: %f' % f1) test_docs = [' '.join(x) for x in test_tokens] X_test = vectorizer.transform(test_docs) pred_gab_test = model.predict(X_test) gab_f1 = f1_score(test_labels, pred_gab_test) gab_p, gab_r = precision_score(test_labels, pred_gab_test), recall_score( test_labels, pred_gab_test) print('Gab test f1: %f (%f, %f)' % (gab_f1, gab_p, gab_r)) ws_processor, nyt_processor = WSProcessor(configs), NytProcessor( configs, subset=dataset == 'ws') ws_test_examples = ws_processor.get_test_examples('data/white_supremacy') _, test_labels, test_tokens = examples_to_bow(ws_test_examples, tokenizer, configs.max_seq_length) test_docs = [' '.join(x) for x in test_tokens] X_test = vectorizer.transform(test_docs) pred_ws_test = model.predict(X_test) ws_f1 = f1_score(test_labels, pred_ws_test) ws_p, ws_r = precision_score(test_labels, pred_ws_test), recall_score( test_labels, pred_ws_test) print('WS test f1: %f (%f, %f)' % (ws_f1, ws_p, ws_r)) nyt_test_examples = nyt_processor.get_test_examples( 'data/nyt_keyword_sample') _, test_labels, test_tokens = examples_to_bow(nyt_test_examples, tokenizer, configs.max_seq_length) test_docs = [' '.join(x) for x in test_tokens] X_test = vectorizer.transform(test_docs) pred_nyt_test = model.predict(X_test) nyt_f1 = accuracy_score(test_labels, pred_nyt_test) print('Nyt test f1: %f' % nyt_f1) dump_coeff(model, vectorizer) return gab_f1, gab_p, gab_r, ws_f1, ws_p, ws_r, nyt_f1
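# Usage sketch for the baseline above: the dataset name switches the training data
# between the Gab and white-supremacy corpora, and the function prints and returns
# Gab/WS F1 plus the NYT score (which, per the code, is computed with
# accuracy_score despite the 'f1' naming). `configs` is the module-level config
# object that fit_tfidf_model mutates.
if __name__ == '__main__':
    for dataset in ('gab', 'ws'):
        gab_f1, gab_p, gab_r, ws_f1, ws_p, ws_r, nyt_score = fit_tfidf_model(dataset)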
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--negative_weight", default=1., type=float) parser.add_argument("--neutral_words_file", default='data/identity.csv') # if true, use test data instead of val data parser.add_argument("--test", action='store_true') # Explanation specific arguments below # whether run explanation algorithms parser.add_argument("--explain", action='store_true', help='if true, explain test set predictions') parser.add_argument("--debug", action='store_true') # which algorithm to run parser.add_argument("--algo", choices=['soc']) # the output filename without postfix parser.add_argument("--output_filename", default='temp.tmp') # see utils/config.py parser.add_argument("--use_padding_variant", action='store_true') parser.add_argument("--mask_outside_nb", action='store_true') parser.add_argument("--nb_range", type=int) parser.add_argument("--sample_n", type=int) # whether use explanation regularization parser.add_argument("--reg_explanations", action='store_true') parser.add_argument("--reg_strength", type=float) parser.add_argument("--reg_mse", action='store_true') # whether discard other neutral words during regularization. default: False parser.add_argument("--discard_other_nw", action='store_false', dest='keep_other_nw') # whether remove neutral words when loading datasets parser.add_argument("--remove_nw", action='store_true') # if true, generate hierarchical explanations instead of word level outputs. # Only useful when the --explain flag is also added. parser.add_argument("--hiex", action='store_true') parser.add_argument("--hiex_tree_height", default=5, type=int) # whether add the sentence itself to the sample set in SOC parser.add_argument("--hiex_add_itself", action='store_true') # the directory where the lm is stored parser.add_argument("--lm_dir", default='runs/lm') # if configured, only generate explanations for instances with given line numbers parser.add_argument("--hiex_idxs", default=None) # if true, use absolute values of explanations for hierarchical clustering parser.add_argument("--hiex_abs", action='store_true') # if either of the two is true, only generate explanations for positive / negative instances parser.add_argument("--only_positive", action='store_true') parser.add_argument("--only_negative", action='store_true') # stop after generating x explanation parser.add_argument("--stop", default=100000000, type=int) # early stopping with decreasing learning rate. 
0: direct exit when validation F1 decreases parser.add_argument("--early_stop", default=5, type=int) # other external arguments originally here in pytorch_transformers parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--validate_steps", default=200, type=int, help="validate once for how many steps") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() combine_args(configs, args) args = configs if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { 'gab': GabProcessor, 'ws': WSProcessor, 'nyt': NytProcessor, 'MT': MTProcessor, #'multi-label': multilabel_Processor, } output_modes = { 'gab': 'classification', 'ws': 'classification', 'nyt': 'classification' } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # save configs f = open(os.path.join(args.output_dir, 'args.json'), 'w') json.dump(args.__dict__, f, indent=4) f.close() task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) processor = processors[task_name](configs, tokenizer=tokenizer) output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if args.do_train: model = 
BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) model.to(device) if args.fp16: model.half() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) # elif n_gpu > 1: # model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: if args.do_train: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss, tr_reg_loss = 0, 0 tr_reg_cnt = 0 epoch = -1 val_best_f1 = -1 val_best_loss = 1e10 early_stop_countdown = args.early_stop if args.reg_explanations: train_lm_dataloder = processor.get_dataloader('train', configs.train_batch_size) dev_lm_dataloader = processor.get_dataloader('dev', configs.train_batch_size) explainer = SamplingAndOcclusionExplain( model, configs, tokenizer, device=device, vocab=tokenizer.vocab, train_dataloader=train_lm_dataloder, dev_dataloader=dev_lm_dataloader, lm_dir=args.lm_dir, output_path=os.path.join(configs.output_dir, configs.output_filename), ) else: explainer = None if args.do_train: epoch = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode, configs) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 
class_weight = torch.FloatTensor([args.negative_weight, 1]).to(device) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss(class_weight) loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() if args.fp16: optimizer.backward(loss) else: loss.backward() # regularize explanations # NOTE: backward performed inside this function to prevent OOM if args.reg_explanations: reg_loss, reg_cnt = explainer.compute_explanation_loss( input_ids, input_mask, segment_ids, label_ids, do_backprop=True) tr_reg_loss += reg_loss # float tr_reg_cnt += reg_cnt nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % args.validate_steps == 0: val_result = validate(args, model, processor, tokenizer, output_mode, label_list, device, num_labels, task_name, tr_loss, global_step, epoch, explainer) val_acc, val_f1 = val_result['acc'], val_result['f1'] if val_f1 > val_best_f1: val_best_f1 = val_f1 if args.local_rank == -1 or torch.distributed.get_rank( ) == 0: save_model(args, model, tokenizer, num_labels) else: # halve the learning rate for param_group in optimizer.param_groups: param_group['lr'] *= 0.5 early_stop_countdown -= 1 logger.info( "Reducing learning rate... Early stop countdown %d" % early_stop_countdown) if early_stop_countdown < 0: break if early_stop_countdown < 0: break epoch += 1 # training finish ############################ # if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # if not args.explain: # args.test = True # validate(args, model, processor, tokenizer, output_mode, label_list, device, num_labels, # task_name, tr_loss, global_step=0, epoch=-1, explainer=explainer) # else: # args.test = True # explain(args, model, processor, tokenizer, output_mode, label_list, device) if not args.explain: args.test = True print('--Test_args.test: %s' % str(args.test)) #Test_args.test: True validate(args, model, processor, tokenizer, output_mode, label_list, device, num_labels, task_name, tr_loss, global_step=888, epoch=-1, explainer=explainer) args.test = False else: print('--Test_args.test: %s' % str(args.test)) # Test_args.test: True args.test = True explain(args, model, processor, tokenizer, output_mode, label_list, device) args.test = False
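# How the class_weight used above behaves: with CrossEntropyLoss, the weight
# vector [negative_weight, 1] scales the loss of examples whose label is 0 by
# negative_weight, so the index-0 ("negative") class contributes less to training.
# A toy check with reduction='none' (illustrative numbers only):
import torch
from torch.nn import CrossEntropyLoss

weights = torch.tensor([0.1, 1.0])              # e.g. --negative_weight 0.1
loss_fct = CrossEntropyLoss(weight=weights, reduction='none')
logits = torch.zeros(2, 2)                      # uniform predictions for both examples
labels = torch.tensor([0, 1])
print(loss_fct(logits, labels))                 # ~[0.0693, 0.6931]: the label-0 loss is scaled by 0.1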
def __init__(self, config, coref_task_config):
    super(CorefModel, self).__init__(config)
    self.config = coref_task_config
    self.max_segment_len = self.config['max_segment_len']
    self.max_span_width = self.config["max_span_width"]
    self.genres = {g: i for i, g in enumerate(self.config["genres"])}
    self.subtoken_maps = {}
    self.gold = {}
    self.eval_data = None
    self.bert_config = modeling.BertConfig.from_json_file(self.config["bert_config_file"])
    self.tokenizer = BertTokenizer.from_pretrained(self.config['vocab_file'], do_lower_case=True)
    self.bert = BertModel(config=self.bert_config)
    self.dropout = nn.Dropout(self.config["dropout_rate"])
    self.emb_dim = self.bert_config.hidden_size*2 + int(self.config["use_features"])*20 + int(self.config["model_heads"])*self.bert_config.hidden_size
    self.slow_antecedent_dim = self.emb_dim*3 + int(self.config["use_metadata"])*40 + int(self.config["use_features"])*20 + int(self.config['use_segment_distance'])*20

    # span width embedding
    if self.config["use_features"]:
        self.span_width_embedding = nn.Embedding(
            num_embeddings=self.config["max_span_width"],
            embedding_dim=self.config["feature_size"])
    # span head embedding
    if self.config["model_heads"]:
        print("------ using span head information ------")
        self.masked_mention_score = nn.Sequential(
            nn.Linear(self.bert_config.hidden_size, 1),
            Squeezer(dim=1))
    # mention score: two-layer feed-forward network
    self.mention_scores = Score(self.emb_dim, self.config["ffnn_size"])
    # prior_width_embedding
    if self.config['use_prior']:
        self.span_width_prior_embeddings = nn.Embedding(
            num_embeddings=self.config["max_span_width"],
            embedding_dim=self.config["feature_size"])
        # span width score: two-layer feed-forward network
        self.width_scores = Score(self.config["feature_size"], self.config["ffnn_size"])
    # document genre embedding [7, 20]
    self.genres_embedding = nn.Embedding(
        num_embeddings=len(self.genres),
        embedding_dim=self.config["feature_size"])
    # coarse scores for the top-c antecedents: a linear layer + dropout
    self.fast_antecedent_scores = nn.Sequential(
        nn.Linear(self.emb_dim, self.emb_dim),
        nn.Dropout(self.config["dropout_rate"]))
    # antecedent distance embedding
    if self.config['use_prior']:
        self.antecedent_distance_embedding = nn.Embedding(
            num_embeddings=10,
            embedding_dim=self.config["feature_size"])
        self.antecedent_distance_linear = nn.Linear(self.config["feature_size"], 1)
    if self.config["use_metadata"]:
        # [2, 20]
        self.same_speaker_embedding = nn.Embedding(
            num_embeddings=2,
            embedding_dim=self.config["feature_size"])
    if self.config["use_features"]:
        self.antecedent_offset_embedding = nn.Embedding(
            num_embeddings=10,
            embedding_dim=self.config["feature_size"])
    if self.config['use_segment_distance']:
        self.segment_distance_embedding = nn.Embedding(
            num_embeddings=self.config['max_training_sentences'],
            embedding_dim=self.config["feature_size"])
    # two-layer feed-forward network (ffnn) over the concatenated pair input
    if self.config['fine_grained']:
        self.slow_antecedent_scores = nn.Sequential(
            nn.Linear(self.slow_antecedent_dim, self.config["ffnn_size"]),
            nn.ReLU(inplace=True),
            nn.Dropout(self.config["dropout_rate"]),
            nn.Linear(self.config["ffnn_size"], 1),
            Squeezer(dim=-1)
        )
        # linear layer + sigmoid
        self.coref_layer_linear = nn.Sequential(
            nn.Linear(self.emb_dim*2, self.emb_dim),
            nn.Sigmoid()
        )

    self.apply(self.init_bert_weights)
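# Score and Squeezer are used above but defined elsewhere in the project. A
# plausible minimal sketch consistent with how __init__ uses them (the comments
# above describe Score as a two-layer feed-forward scorer, and Squeezer as a
# squeeze step so nn.Sequential can emit one scalar score per input row); the
# dropout value here is an assumption:
import torch
from torch import nn

class Squeezer(nn.Module):
    """Removes a singleton dimension inside an nn.Sequential pipeline."""
    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        return x.squeeze(self.dim)

class Score(nn.Module):
    """Two-layer feed-forward scorer: input_dim -> hidden_dim -> 1 -> squeeze."""
    def __init__(self, input_dim, hidden_dim, dropout=0.3):
        super().__init__()
        self.score = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            Squeezer(dim=-1),
        )

    def forward(self, x):
        return self.score(x)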