def main(args):
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    train_loader, _, _ = get_squad_data_loader(tokenizer, args.train_dir, shuffle=True, args=args)
    eval_data = get_squad_data_loader(tokenizer, args.dev_dir, shuffle=False, args=args)

    args.device = torch.cuda.current_device()
    trainer = VAETrainer(args)

    loss_log1 = tqdm(total=0, bar_format='{desc}', position=2)
    loss_log2 = tqdm(total=0, bar_format='{desc}', position=3)
    eval_log = tqdm(total=0, bar_format='{desc}', position=5)
    best_eval_log = tqdm(total=0, bar_format='{desc}', position=6)

    # Load checkpoint
    if args.load_checkpoint:
        epochs = trainer.loadd(args.model_dir)
        best_f1, best_bleu, best_em = VAETrainer.load_measures(args.model_dir)
        print(f"The current best measures are: F1 = {best_f1}, BLEU = {best_bleu} and EM = {best_em}.")
    else:
        epochs = -1
        best_bleu, best_em, best_f1 = 0.0, 0.0, 0.0

    print("MODEL DIR: " + args.model_dir)
    mlflow_logger = init_mlflow(args, f"{args.model_dir}/mlruns")

    for epoch in trange(int(args.epochs), desc="Epoch", position=0):
        if epoch <= epochs:
            print(f"skipping epoch {epoch}...")
        else:
            for batch in tqdm(train_loader, desc="Train iter", leave=False, position=1):
                c_ids, q_ids, a_ids, start_positions, end_positions = batch_to_device(batch, args.device)
                trainer.train(c_ids, q_ids, a_ids, start_positions, end_positions)

                str1 = 'Q REC : {:06.4f} A REC : {:06.4f}'
                str2 = 'ZQ KL : {:06.4f} ZA KL : {:06.4f} INFO : {:06.4f}'
                str1 = str1.format(float(trainer.loss_q_rec), float(trainer.loss_a_rec))
                str2 = str2.format(float(trainer.loss_zq_kl), float(trainer.loss_za_kl), float(trainer.loss_info))
                loss_log1.set_description_str(str1)
                loss_log2.set_description_str(str2)

            if epoch >= 0:
                f1, em, bleu, _str = eval_measures(epoch, args, trainer, eval_data)
                eval_log.set_description_str(_str)
                result = {"epoch": epoch, "em": em, "f1": f1, "bleu": bleu}
                mlflow_logger.on_result(result)

                if em > best_em:
                    best_em = em
                if f1 > best_f1:
                    best_f1 = f1
                    trainer.save(os.path.join(args.model_dir, "best_f1_model.pt"), epoch, f1, bleu, em)
                if bleu > best_bleu:
                    best_bleu = bleu
                    trainer.save(os.path.join(args.model_dir, "best_bleu_model.pt"), epoch, f1, bleu, em)

                trainer.save(os.path.join(args.model_dir, "checkpoint.pt"), epoch, f1, bleu, em)
                mlflow_logger.on_checkpoint(f"{args.model_dir}/mlruns/checkpoint")

                _str = 'BEST BLEU : {:02.2f} EM : {:02.2f} F1 : {:02.2f}'.format(best_bleu, best_em, best_f1)
                best_eval_log.set_description_str(_str)
random.seed(2019)

logger = logging.getLogger('propaganda_predict_TC')

PROP_CLASS = ['Appeal_to_Authority', 'Appeal_to_fear-prejudice', 'Bandwagon,Reductio_ad_hitlerum',
              'Black-and-White_Fallacy', 'Causal_Oversimplification', 'Doubt',
              'Exaggeration,Minimisation', 'Flag-Waving', 'Loaded_Language',
              'Name_Calling,Labeling', 'Repetition', 'Slogans',
              'Thought-terminating_Cliches', 'Whataboutism,Straw_Men,Red_Herring']

PRETRAINED_MODEL = 'bert-base-uncased'
MAX_TOKEN = 128
EPOCHS = 5
BATCH_SIZE = 64

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
logger.info("Bert pretrained model: {0}".format(PRETRAINED_MODEL))


# load article files
def loadArticleFiles(article_dir):
    articles = {}
    article_files = glob.glob(os.path.join(article_dir, "*.txt"))
    for filename in article_files:
        with open(filename, "r", encoding="utf-8") as f:
            content = f.read()
        article_id = os.path.basename(filename).split(".")[0][7:]
        if 'uncased' in PRETRAINED_MODEL:
            content = content.lower()
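        # Assumed continuation (hedged; not the author's confirmed code): store the
        # article text under its id and return the mapping once the loop finishes.
        articles[article_id] = content
    return articles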
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default="whole_dataset_biobert", type=str, required=True,
                        help="The input data dir. Should contain the .json files (or other data files) for the task.")
    parser.add_argument("--out_dir", default="whole_dataset", type=str, required=True,
                        help="The directory where the output embeddings will be stored as a pickled dictionary")
    parser.add_argument("--filter_file", default=None, type=str, required=False,
                        help="Path to a file listing the names of the files to consider out of the entire dataset.")
    parser.add_argument("--model_path", default=None, type=str, required=False,
                        help="The path to the .bin transformer model.")
    parser.add_argument("--have_input_data", action="store_true",
                        help="Whether the input data is already stored in the form of tensors")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--batch_size", default=16, type=int, help="The batch size to feed the model")
    parser.add_argument('--seed_words', nargs='+')
    args = parser.parse_args()

    add_seed_word(args.seed_words)
    logger.info("seed words given by user are %s", str(seed_words))

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    if not args.have_input_data:
        json_files = extract_data(args.filter_file)
        data = preprocess_data_to_df(json_files)
        abstracts = data["abstract"].to_list()
        logger.info("total abstracts: %d", len(abstracts))
        input_ids, attention_masks, _ = create_input_ids__attention_masks_tensor(
            data, tokenizer, args.max_seq_length)
        del data
    else:
        input_ids = torch.load(f"inputs/{args.data_dir}/input_ids.pt")
        attention_masks = torch.load(f"inputs/{args.data_dir}/attention_masks.pt")

    logger.info("%s", str(input_ids.shape))
    logger.info('Token IDs: %s', str(input_ids[0]))

    if args.model_path is None:
        model = BertModel.from_pretrained("bert-base-cased")
    else:
        configuration = BertConfig.from_json_file(f"{args.model_path}/config.json")
        model = BertModel.from_pretrained(f"{args.model_path}/pytorch_model.bin", config=configuration)
    model.cuda()

    tensor_dataset = TensorDataset(input_ids, attention_masks)
    batch_size = args.batch_size
    dataloader = DataLoader(tensor_dataset, sampler=SequentialSampler(tensor_dataset), batch_size=batch_size)

    device = torch.device("cuda")

    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Measure the total time for the whole run.
    total_t0 = time.time()

    logger.info("")
    logger.info('Forward pass...')
    model.eval()

    token_to_embedding_map = defaultdict(list)
    seed_embeddings = defaultdict(list)
    # number of times a token is encountered: needed to maintain the average
    token_count = defaultdict(int)

    t0 = time.time()
    for step, batch in enumerate(dataloader):
        if step % 100 == 0:
            logger.info('======== Batch {:} / {:} ========'.format(step, len(dataloader)))

        # `batch` contains two pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # NOTE: tuple unpacking assumes transformers < 4.x; newer versions return a
        # ModelOutput and need return_dict=False on the call.
        embeddings, cls = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # move everything to cpu to save GPU space
        b_input_ids_np = b_input_ids.cpu().numpy()
        b_input_mask_np = b_input_mask.cpu().numpy()
        embeddings_np = embeddings.detach().cpu().numpy()
        cls_np = cls.detach().cpu().numpy()
        del b_input_ids, b_input_mask, embeddings, cls
        torch.cuda.empty_cache()

        for batch_number in range(len(b_input_ids_np)):
            tokens = tokenizer.convert_ids_to_tokens(b_input_ids_np[batch_number])
            for token, embedding in zip(tokens, embeddings_np[batch_number]):
                # add the seed word to the seed dict
                if token in seed_words:
                    if token not in seed_embeddings:
                        seed_embeddings[token] = embedding
                    else:
                        seed_embeddings[token] += embedding
                # every token, including seeds, is also added to token_to_embedding_map
                if token not in token_to_embedding_map and token not in stop_words:
                    token_to_embedding_map[token] = embedding
                    tokens_with_embeddings.add(token)
                elif token not in stop_words:
                    token_to_embedding_map[token] += embedding
                token_count[token] += 1

        if step % 1000 == 0 and step > 0:
            with open(f'word_embeddings/{args.out_dir}/word_embeddings_averaged_{step}.pickle', 'wb') as handle:
                pickle.dump(token_to_embedding_map, handle, protocol=pickle.HIGHEST_PROTOCOL)
            del token_to_embedding_map
            token_to_embedding_map = defaultdict(list)
            logger.info("Time to find embeddings for batches {} to {}: {:} (h:mm:ss)".format(
                max(0, step - 500), step, format_time(time.time() - t0)))
            t0 = time.time()

        del b_input_ids_np, b_input_mask_np, embeddings_np, cls_np

    # save the embeddings of the seed words
    for token, embedding in seed_embeddings.items():
        seed_embeddings[token] = embedding / (token_count[token] * 1.0)
    with open(f'word_embeddings/{args.out_dir}/seed_embeddings_averaged.pickle', 'wb') as handle:
        pickle.dump(seed_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    del seed_embeddings

    # save the word embeddings
    with open(f'word_embeddings/{args.out_dir}/word_embeddings_averaged_{step}.pickle', 'wb') as handle:
        pickle.dump(token_to_embedding_map, handle, protocol=pickle.HIGHEST_PROTOCOL)
    del token_to_embedding_map

    # save the number of times each token occurs
    with open(f'word_embeddings/{args.out_dir}/token_count.pickle', 'wb') as handle:
        pickle.dump(token_count, handle, protocol=pickle.HIGHEST_PROTOCOL)
    del token_count

    logger.info("Total time to complete the entire process: {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))
    logger.info("\n")
    logger.info("Embeddings received!")
def feature_extracter_from_texts(self, mashup_api=None):
    """
    Features need to be extracted from the descriptions of both mashups and services;
    this is the whole feature-extraction pipeline for the text on the right branch.
    If it is shared, it should be wrapped into a new model!
    :param mashup_api: None by default; non-None only for 'HDP'/'Bert'
    :return: a wrapped model, so it can be shared by mashup and api
    """
    if self.args.text_extracter_mode in fixed_vector_modes and mashup_api is not None:
        if self.args.text_extracter_mode == 'Bert':
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            bertModel = BertModel.from_pretrained("bert-base-uncased")
            if mashup_api == 'mashup':
                if self.mashup_text_feature_extracter is None:  # not computed yet
                    mashup_texts = get_iterable_values(data_repository.get_md().mashup_df,
                                                       'final_description', return_ele_type='str')
                    dense_mashup_features = bertModel(tokenizer(mashup_texts, return_tensors='tf'))
                    self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                        'mashup', dense_mashup_features)
                return self.mashup_text_feature_extracter
            elif mashup_api == 'api':
                if self.api_text_feature_extracter is None:
                    api_texts = get_iterable_values(data_repository.get_md().api_df,
                                                    'final_description', return_ele_type='str')
                    dense_api_features = bertModel(tokenizer(api_texts, return_tensors='tf'))
                    self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                        'api', dense_api_features)
                return self.api_text_feature_extracter
            else:
                raise TypeError('wrong mashup_api mode!')
        else:
            if self.gd is None:
                # process the text with gensim; no tags are added to the text
                self.gd = get_default_gd(tag_times=0, mashup_only=False, strict_train=True)
            self.gd.model_pcs(self.args.text_extracter_mode)

            if mashup_api == 'mashup':
                if self.mashup_text_feature_extracter is None:  # not computed yet
                    self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                        'mashup', self.gd.dense_mashup_features)
                return self.mashup_text_feature_extracter
            elif mashup_api == 'api':
                if self.api_text_feature_extracter is None:
                    self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                        'api', self.gd.dense_api_features)
                return self.api_text_feature_extracter
            else:
                raise TypeError('wrong mashup_api mode!')
    elif self.text_feature_extracter is None:  # not computed yet
        if 'trainable_bert' in self.args.text_extracter_mode.lower():
            self.text_feature_extracter = TFDistilBertModel.from_pretrained(
                "distilbert-base-uncased")  # layer
            if self.args.frozen_bert:
                self.text_feature_extracter.trainable = False
        else:
            text_input = Input(shape=(self.args.MAX_SEQUENCE_LENGTH,), dtype='int32')
            text_embedding_layer = self.get_text_embedding_layer()  # the parameters still need to be supplied externally!
            text_embedded_sequences = text_embedding_layer(text_input)  # convert to 2D
            if self.args.text_extracter_mode in ('inception', 'textCNN'):
                # 2D to 3D; the third dimension is the channel
                # print(text_embedded_sequences.shape)
                text_embedded_sequences = Lambda(
                    lambda x: tf.expand_dims(x, axis=3))(text_embedded_sequences)
                # tf tensors and keras tensors are different!!!
                print(text_embedded_sequences.shape)

            if self.args.text_extracter_mode == 'inception':
                x = inception_layer(text_embedded_sequences, self.args.embedding_dim,
                                    self.args.inception_channels,
                                    self.args.inception_pooling)  # inception processing
                print('built inception layer, done!')
            elif self.args.text_extracter_mode == 'textCNN':
                x = textCNN_feature_extracter_from_texts(text_embedded_sequences, self.args)
            elif self.args.text_extracter_mode == 'LSTM':
                x = LSTM_feature_extracter_from_texts(text_embedded_sequences, self.args)
            else:
                raise TypeError('wrong extracter!')
            # inspect the module's output features before the MLP transformation
            print('text feature after inception/textCNN/LSTM whole_model,', x)

            for FC_unit_num in self.args.inception_fc_unit_nums:
                x = Dense(FC_unit_num, kernel_regularizer=l2(self.args.l2_reg))(x)  # , activation='relu'
                if self.args.inception_MLP_BN:
                    x = BatchNormalization(scale=False)(x)
                    x = PReLU()(x)
                if self.args.inception_MLP_dropout:
                    x = tf.keras.layers.Dropout(0.5)(x)

            self.text_feature_extracter = Model(text_input, x, name='text_feature_extracter')
    return self.text_feature_extracter
import json
import logging
import os
import sys

import torch
import torch.utils.data
import torch.utils.data.distributed
from transformers import BertTokenizer

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

MAX_LEN = 64  # this is the max length of the sentence

print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def model_fn(model_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model = torch.jit.load(os.path.join(model_dir, "traced_bert.pt"))
    return loaded_model.to(device)


def input_fn(request_body, request_content_type):
    """An input_fn that loads a pickled tensor"""
    if request_content_type == "application/json":
        sentence = json.loads(request_body)
        input_ids = []
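        # --- hedged continuation sketch, not the author's confirmed code ---
        # Assumption: a single sentence arrives as a JSON string and is encoded with
        # the module-level tokenizer, padded/truncated to MAX_LEN.
        encoded = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids.append(encoded["input_ids"])
        return torch.cat(input_ids, dim=0), encoded["attention_mask"]
    raise ValueError("Unsupported content type: {}".format(request_content_type))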
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_TYPES))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--summary", default=None, type=str, help="Model summary")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--feature_dir", default=None, type=str,
                        help="The input feature dir. Should contain the cached_features_file for the task.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="The input training file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="The input evaluation file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--test_file", default=None, type=str, help="The input test file.")
    parser.add_argument("--test_prob_file", default=None, type=str, help="The output test_prob file.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--version_2_with_negative", action="store_true",
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=32, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set.")
    parser.add_argument("--do_merge", action="store_true", help="Whether to merge test prob.")
    parser.add_argument("--evaluate_during_training", action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--do_fgm", action="store_true", help="Whether to run Adv-FGM training.")
    parser.add_argument("--do_pgd", action="store_true", help="Whether to run Adv-PGD training.")
    parser.add_argument("--gc", action="store_true", help="Whether to run optimizer-gc training.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_ratio", default=0.1, type=float, help="Linear warmup over warmup_ratio.")
    parser.add_argument("--n_best_size", default=10, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=32, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action="store_true",
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--lang_id", default=0, type=int,
                        help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)")
    parser.add_argument("--best_val_f1", type=float, default=0., help="best_val_f1")
    parser.add_argument("--best_val_step", type=int, default=0, help="best_val_step")
    parser.add_argument("--logging_ratio", type=float, default=0.1, help="Log every X updates ratio.")
    parser.add_argument("--save_ratio", type=float, default=0.1, help="Save checkpoint every X updates ratio.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")
    args = parser.parse_args()

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir)
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = BertForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, set_type='train', output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`.
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c)
                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
                )
                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        else:
            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
            checkpoints = [args.model_name_or_path]

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = BertForQuestionAnswering.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix='dev', step=global_step)
            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    if args.do_test and args.local_rank in [-1, 0]:
        if args.do_train:
            checkpoint = f'{args.output_dir}/checkpoint-{args.best_val_step}'
            model = BertForQuestionAnswering.from_pretrained(checkpoint)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix='test', step=args.best_val_step)
        else:
            global_step = args.model_name_or_path.split("-")[-1]
            model = BertForQuestionAnswering.from_pretrained(args.model_name_or_path)
            model.to(args.device)
            evaluate(args, model, tokenizer, prefix='test', step=global_step)

    if args.do_merge:
        merge(args, tokenizer, prefix="test")
cols_name = [
    'Date', 'Note', 'myr', 'uname', 'tuname', 'ADULT_CONTENT', 'HEALTH',
    'DRUGS_ALCOHOL_GAMBLING', 'RACE', 'VIOLENCE_CRIME', 'POLITICS', 'RELATION', 'LOCATION'
]
label_cols = cols_name[5:]  # drop the 5 leftmost columns ('Date' through 'tuname')
sens_cols = [
    'ADULT_CONTENT', 'HEALTH', 'DRUGS_ALCOHOL_GAMBLING', 'RACE',
    'VIOLENCE_CRIME', 'POLITICS', 'RELATION', 'LOCATION', 'T'
]
personal_cols = ['A', 'E', 'I', 'P', 'T']
userfields = ['S', 'P', 'T', 'A']

bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
saved_model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))
saved_model.load_weights(MODEL_FILE)
time.sleep(5)
print("\n MODEL LOADED\n\n\n\n\n")

# note: chained assignment makes c2..c9 aliases of the same list object
c2 = c3 = c4 = c5 = c6 = c7 = c8 = c9 = [0] * (BATCH - 1)

#===============================================================#
"""
Convert all letters to lower or upper case (common : lower case)
"""


def convert_letters(tokens, style="lower"):
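    # Hedged sketch of the truncated body (assumption: simple elementwise case
    # folding; not the author's confirmed code):
    if style == "lower":
        return [t.lower() for t in tokens]
    return [t.upper() for t in tokens]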
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        src_encoder = init_model(args, src_encoder, restore_path=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore_path=param.src_classifier_path)
        # tgt_encoder = init_model(args, tgt_encoder, restore_path=param.tgt_encoder_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    if args.pretrain:
        print("=== Training classifier for source domain ===")
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_loader)

        # save pretrained model
        save_model(args, src_encoder, param.src_encoder_path)
        save_model(args, src_classifier, param.src_classifier_path)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, src_classifier, src_loader)
    src_acc = evaluate(args, src_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    for params in src_encoder.parameters():
        params.requires_grad = False
    for params in src_classifier.parameters():
        params.requires_grad = False

    # adapt
    print("=== Adapt tgt encoder ===")
    tgt_encoder.load_state_dict(src_encoder.state_dict())
    if args.src_free:
        s_res_features = src_gmm(args, src_encoder, src_loader)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        tgt_encoder = aad_adapt_src_free(args, src_encoder, tgt_encoder, discriminator,
                                         src_classifier, src_loader, tgt_train_loader, tgt_all_loader)
    else:
        tgt_encoder = aad_adapt(args, src_encoder, tgt_encoder, discriminator,
                                src_classifier, src_loader, tgt_train_loader, tgt_all_loader)

    # save_model(args, tgt_encoder, param.tgt_encoder_path)

    # argument setting
    # print("=== Argument Setting ===")
    print(f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
          f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; adv weight: {args.alpha}; "
          f"KD weight: {args.beta}; temperature: {args.temperature}; src: {args.src}; tgt: {args.tgt}; "
          f'src_free: {args.src_free}; dp: {args.dp}; ent: {args.ent}')

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaptation <<<")
    tgt_acc = evaluate(args, tgt_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
            f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
            f"tgt: {args.tgt}; dp: {args.dp}; ent: {args.ent}\n\n")
LEARNING_RATE = 2e-5
TRAIN_EPOCHS = 10

financial_news_fp = "data/financial_news_data_downsampled.csv"
df = pd.read_csv(financial_news_fp)

df_train, df_val_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_val_test, test_size=0.3, random_state=RANDOM_SEED)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

class_names = ["negative", "neutral", "positive"]
num_classes = len(df_train[LABEL_COL].unique())

tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_MODEL, do_lower_case=True)

# Example
sample_text = df_train[TEXT_COL].iloc[0]
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

encoding = tokenizer.encode_plus(
    sample_text,
    max_length=MAX_LENGTH,
    add_special_tokens=True,
    return_token_type_ids=False,
    truncation="longest_first",
    padding="max_length",
    return_attention_mask=True,
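    # Hedged completion of the truncated call (assumed trailing argument; not the
    # author's confirmed code):
    return_tensors="pt",
)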
def main():
    MODEL_CACHE = './model/bert-base-chinese'
    WORD_2_VECTOR_MODEL_DIR = './model/merge_sgns_bigram_char300.txt'
    WORD_FREQ_DICT = './dict/modern_chinese_word_freq.txt'
    EVAL_FILE_PATH = './dataset/annotation_data.csv'
    BERT_RES_PATH = './data/bert_ss_res.csv'
    # ERNIE_RES_PATH = './data/ernie_output.csv'
    VECTOR_RES_PATH = './data/vector_ss_res.csv'
    DICT_RES_PATH = './data/dict_ss_res.csv'
    HOWNET_RES_PATH = './data/hownet_ss_res.csv'
    HYBRID_RES_PATH = './data/hybrid_ss_res.csv'
    SUBSTITUTION_NUM = 10

    word_2_vector_model_dir = WORD_2_VECTOR_MODEL_DIR
    model_cache = MODEL_CACHE
    word_freq_dict = WORD_FREQ_DICT
    eval_file_path = EVAL_FILE_PATH
    bert_res_path = BERT_RES_PATH
    # ernie_res_path = ERNIE_RES_PATH
    vector_res_path = VECTOR_RES_PATH
    dict_res_path = DICT_RES_PATH
    hownet_res_path = HOWNET_RES_PATH
    hybrid_res_path = HYBRID_RES_PATH
    substitution_num = SUBSTITUTION_NUM

    print('loading models...')
    tokenizer = BertTokenizer.from_pretrained(model_cache)
    model = BertForMaskedLM.from_pretrained(model_cache)
    model.to('cuda')
    model.eval()

    print('loading embeddings...')
    model_word2vector = gensim.models.KeyedVectors.load_word2vec_format(
        word_2_vector_model_dir, binary=False)

    print('loading files...')
    word_freq_dict = read_dict(word_freq_dict)
    bert_res = read_ss_result(bert_res_path)
    vector_res = read_ss_result(vector_res_path)
    dict_res = read_ss_result(dict_res_path)
    hownet_res = read_ss_result(hownet_res_path)
    hybrid_res = read_ss_result(hybrid_res_path)
    row_lines, source_sentences, source_words = read_dataset(eval_file_path)

    for row_line, source_sentence, source_word, bert_subs, vector_subs, dict_subs, hownet_subs, hybrid_subs in zip(
            row_lines, source_sentences, source_words, bert_res, vector_res, dict_res, hownet_res, hybrid_res):
        # Running everything may take a long time; it is recommended to comment out
        # the code blocks you don't need and run only the required tests.
        if bert_subs[0] != 'NULL':
            bert_pre_word, bert_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, bert_subs, word_freq_dict, substitution_num)
        else:
            bert_pre_word = 'NULL'
            bert_ss_sorted = ['NULL']

        if vector_subs[0] != 'NULL':
            vector_pre_word, vector_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, vector_subs, word_freq_dict, substitution_num)
        else:
            vector_pre_word = 'NULL'
            vector_ss_sorted = ['NULL']

        if dict_subs[0] != 'NULL':
            dict_pre_word, dict_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, dict_subs, word_freq_dict, substitution_num)
        else:
            dict_pre_word = 'NULL'
            dict_ss_sorted = ['NULL']

        if hownet_subs[0] != 'NULL':
            hownet_pre_word, hownet_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, hownet_subs, word_freq_dict, substitution_num)
        else:
            hownet_pre_word = 'NULL'
            hownet_ss_sorted = ['NULL']

        if hybrid_subs[0] != 'NULL':
            hybrid_pre_word, hybrid_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, source_sentence,
                source_word, hybrid_subs, word_freq_dict, substitution_num)
        else:
            hybrid_pre_word = 'NULL'
            hybrid_ss_sorted = ['NULL']

        save_result(row_line, bert_pre_word, bert_ss_sorted,
                    './test/data/nohownet/bert_sr_res_no_hownet.csv')
        save_result(row_line, vector_pre_word, vector_ss_sorted,
                    './test/data/nohownet/vector_sr_res_no_hownet.csv')
        save_result(row_line, dict_pre_word, dict_ss_sorted,
                    './test/data/nohownet/dict_sr_res_no_hownet.csv')
        save_result(row_line, hownet_pre_word, hownet_ss_sorted,
                    './test/data/nohownet/hownet_sr_res_no_hownet.csv')
        save_result(row_line, hybrid_pre_word, hybrid_ss_sorted,
                    './test/data/nohownet/hybrid_sr_res_no_hownet.csv')
import json
import torch
from tqdm import tqdm
import argparse
import pickle
import random
import numpy as np
from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig, BertPreTrainedModel

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)

import logging
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


class Example():
    def __init__(self, qid, question_text, answer_text, context_text,
                 start_pos, title, answerable, answers):
        self.qid = qid
        self.question_text = question_text
        self.answer_text = answer_text
        self.context_text = context_text
        self.start_pos = start_pos
        self.title = title
        self.answerable = answerable
        self.answers = answers


class testExample():
    def __init__(
    'max_len': 500,
    'dropout_rate': 0.2,
    'kernel_size': 5,
    'num_patience': 3,
    'lr': 3e-4,
    'max_word_len': 1000,
    'max_char_len': 10,
    'char_embed_size': 100,
    'cnn_filters': 300,
    'cnn_kernel_size': 5,
    'init_lr': 1e-4,
    'max_lr': 8e-4
}

# note: the standard lowercasing flag for these tokenizers is do_lower_case;
# 'lowercase' is passed through as an init kwarg and appears to go unused.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', lowercase=True, add_special_tokens=True)
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', lowercase=True, add_special_tokens=True)
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lowercase=True, add_special_tokens=True)
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', lowercase=True, add_special_tokens=True)
def test(args, testfile, true_label, save_flag: bool, seed_val):
    device = util.get_device(device_no=args.device_no)
    model = torch.load(args.model_path, map_location=device)

    # seed_val = 2346610
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # testfile = args.output_file
    # true_label = args.label
    truncation = args.truncation
    n_samples = None
    if "n_samples" in args:
        n_samples = args.n_samples

    # saves_dir = "saves/"
    # time = datetime.datetime.now()
    # saves_path = os.path.join(saves_dir, util.get_filename(time))
    # if save_flag:
    #     Path(saves_path).mkdir(parents=True, exist_ok=True)
    #     log_path = os.path.join(saves_path, "testing.log")
    #     logging.basicConfig(filename=log_path, filemode='w', format='%(name)s - %(levelname)s - %(message)s')
    #     logger = logging.getLogger()
    #     logger.setLevel(logging.DEBUG)

    # Load the BERT tokenizer.
    # logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    max_len = 0
    reviews = []
    labels = []
    with open(testfile, "r") as fin:
        reviews = fin.readlines()
    reviews = [rev.lower() for rev in reviews]

    if n_samples is None:
        n_samples = len(reviews)
    indices = np.random.choice(np.arange(len(reviews)), size=n_samples)
    selected_reviews = [reviews[idx] for idx in indices]
    labels = [0 if true_label == "negative" else 1] * len(selected_reviews)

    # For every sentence...
    # for rev in selected_reviews:
    #     # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    #     input_ids = tokenizer.encode(rev, add_special_tokens=True)
    #     # Update the maximum sentence length.
    #     max_len = max(max_len, len(input_ids))
    # print('Max sentence length: ', max_len)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for rev in selected_reviews:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        input_id = tokenizer.encode(rev, add_special_tokens=True)
        if len(input_id) > 512:
            if truncation == "tail-only":
                # tail-only truncation
                input_id = [tokenizer.cls_token_id] + input_id[-511:]
            elif truncation == "head-and-tail":
                # head-and-tail truncation
                input_id = [tokenizer.cls_token_id] + input_id[1:129] + input_id[-382:] + [tokenizer.sep_token_id]
            else:
                # head-only truncation
                input_id = input_id[:511] + [tokenizer.sep_token_id]
            input_ids.append(torch.tensor(input_id).view(1, -1))
            attention_masks.append(torch.ones([1, len(input_id)], dtype=torch.long))
        else:
            encoded_dict = tokenizer.encode_plus(
                rev,                          # Sentence to encode.
                add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
                max_length=512,               # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,   # Construct attn. masks.
                return_tensors='pt',          # Return pytorch tensors.
            )
            # Add the encoded sentence to the list.
            input_ids.append(encoded_dict['input_ids'])
            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Set the batch size.
    batch_size = 8

    # Create the DataLoader.
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    print(' DONE.')
    return predictions, true_labels, reviews
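# A hedged usage sketch (the helper name below is hypothetical): test() returns one
# logits array per batch; a typical way to flatten them and score accuracy is:
def flatten_predictions(predictions, true_labels):
    # Concatenate per-batch logits/labels, take the argmax class per example,
    # and compute overall accuracy.
    logits = np.concatenate(predictions, axis=0)
    labels = np.concatenate(true_labels, axis=0)
    preds = np.argmax(logits, axis=1)
    return preds, float((preds == labels).mean())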
def _create_examples(self, lines):
    with torch.no_grad():
        if self.cfg.feature:
            fea_tokenizer = BertTokenizer.from_pretrained(
                osp.join(self.cfg.pretrained_lm_path, 'bert-base'), do_lower_case=True)
            feature = BertModel.from_pretrained(osp.join(self.cfg.pretrained_lm_path, 'bert-base'))
            if self.cfg.pretrained_bert is not None:
                print('loading feature ckpt from ', self.cfg.pretrained_bert)
                assert osp.exists(self.cfg.pretrained_bert)
                if self.cfg.cuda:
                    feature = feature.cuda()
                    checkpoint = torch.load(self.cfg.pretrained_bert)
                else:
                    checkpoint = torch.load(self.cfg.pretrained_bert,
                                            map_location=lambda storage, loc: storage)
                feature.load_state_dict(checkpoint['net'])

        examples = []
        index2qid = []
        i = 0
        if self.cfg.test:
            lines = lines[:200]
        for line in tqdm(lines):
            data = dict()
            data['index'] = i
            i += 1
            data['qid'] = line['qID']
            index2qid.append(data['qid'])

            sentence = line['sentence']
            name1 = line['option1']
            name2 = line['option2']
            # data['sentence'] = line['sentence']
            # data['option1'] = line['option1']
            # data['option2'] = line['option2']

            conj = "_"
            idx = sentence.index(conj)
            context = sentence[:idx]
            option_str = "_ " + sentence[idx + len(conj):].strip()
            option1 = option_str.replace("_", name1)
            option2 = option_str.replace("_", name2)
            options = [{'segment1': context, 'segment2': option1},
                       {'segment1': context, 'segment2': option2}]

            # the test set has no answer key, so use '1' as a dummy label
            data['label_ids'] = self.LABELS.index(line.get('answer', '1'))
            _, data['token_ids'], data['mask'], data['segment_ids'] = \
                self.example_to_token_ids_segment_ids_label_ids(
                    options,
                    self.tokenizer,
                    cls_token_at_end=False,
                    cls_token=self.tokenizer.cls_token,
                    sep_token=self.tokenizer.sep_token,
                    sep_token_extra=False,
                    cls_token_segment_id=0,
                    pad_on_left=False,
                    pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
                    pad_token_segment_id=0)

            if self.cfg.feature:
                if self.cfg.model == 'bert':
                    input_ids = torch.Tensor(data['token_ids']).long()
                    input_mask = torch.Tensor(data['mask']).long()
                    segment_ids = torch.Tensor(data['segment_ids']).long()
                else:
                    _, input_ids, segment_ids, input_mask = \
                        self.example_to_token_ids_segment_ids_label_ids(
                            options,
                            tokenizer=fea_tokenizer,
                            cls_token_at_end=False,
                            cls_token=fea_tokenizer.cls_token,
                            sep_token=fea_tokenizer.sep_token,
                            cls_token_segment_id=0,
                            pad_on_left=False,
                            pad_token=fea_tokenizer.convert_tokens_to_ids([fea_tokenizer.pad_token])[0],
                            pad_token_segment_id=0)
                    input_ids = torch.Tensor(input_ids).long()
                    input_mask = torch.Tensor(input_mask).long()
                    segment_ids = torch.Tensor(segment_ids).long()
                if self.cfg.cuda:
                    input_ids = input_ids.cuda()
                    input_mask = input_mask.cuda()
                    segment_ids = segment_ids.cuda()
                bert_outputs = feature(input_ids, attention_mask=input_mask,
                                       token_type_ids=segment_ids)
                data['feature'] = bert_outputs[0].cpu().data
                data['fea_mask'] = input_mask.cpu().data

            examples.append(data)
        torch.cuda.empty_cache()
    return examples, index2qid
jon_folder = 'C:/Users/mmall/Documents/github/bertembeddings/data/jonathans/adjacent/'

random_model = False
# random_model = True

if random_model:
    # config = AutoConfig.from_pretrained(pretrained_weights, output_hidden_states=True,
    #                                     output_attentions=args.attention,
    #                                     cache_dir='pretrained_models')
    # model = AutoModel.from_config(config)
    model = BertModel(BertConfig(output_hidden_states=True, output_attentions=True))
else:
    model = BertModel.from_pretrained('bert-base-cased',
                                      output_hidden_states=True,
                                      output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

lines = pkl.load(open(jon_folder + 'phrase_boundary_tree_dist.pkl', 'rb'))

#%%
max_num = 300
these_bounds = [0, 1, 2, 3, 4]

frob = []
nuc = []
inf = []
csim = []
avgdist = []
whichline = []
whichcond = []
whichswap = []
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved
#
"""
requirements:
Authors: daisongtai([email protected])
Date: 2019/5/29 6:38 PM
"""
from __future__ import print_function

import re

from transformers import BertTokenizer

max_seq_length = 500
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext-large', do_lower_case=True)

LHan = [
    [0x2E80, 0x2E99],   # Han # So [26]  CJK RADICAL REPEAT, CJK RADICAL RAP
    [0x2E9B, 0x2EF3],   # Han # So [89]  CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2F00, 0x2FD5],   # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
    0x3005,             # Han # Lm       IDEOGRAPHIC ITERATION MARK
    0x3007,             # Han # Nl       IDEOGRAPHIC NUMBER ZERO
    [0x3021, 0x3029],   # Han # Nl [9]   HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
    [0x3038, 0x303A],   # Han # Nl [3]   HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
    0x303B,             # Han # Lm       VERTICAL IDEOGRAPHIC ITERATION MARK
    [0x3400, 0x4DB5
if os.path.exists(outDir):
    filelist = [f for f in os.listdir(outDir)]
    for f in filelist:
        os.remove(os.path.join(outDir, f))
else:
    os.makedirs(outDir)

device = torch.device("cuda" if (args.gpu and torch.cuda.is_available()) else "cpu")
print('Device', device)
n_gpu = torch.cuda.device_count()

model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
                                                      num_labels=args.num_labels)
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=True)
model.cuda()  # note: assumes a GPU, even though `device` above may be "cpu"

train_inputs, train_labels, train_masks = readData(tokenizer, args, mode="train")
validation_inputs, validation_labels, validation_masks = readData(tokenizer, args, mode="dev")

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
def main(args):
    warnings.filterwarnings("ignore")

    # Load documents
    with open('./data/docs_noun.json', 'r') as f:
        json_docs = json.load(f)

    # prepare the dataset
    with open('data/test_anno.json', 'r') as f:
        val_json = json.load(f)

    tmpdir = 'tmp'
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    # ------ Retrieve documents ------------------
    if os.path.exists(tmpdir + '/eval_dr.json'):
        with open(tmpdir + '/eval_dr.json', 'r') as f:
            val_json = json.load(f)
    else:
        document_retrieval(args, json_docs, val_json)
        with open(tmpdir + '/eval_dr.json', 'w') as f:
            json.dump(val_json, f)

    # Calculate recall
    rank = []
    for vidx, d in enumerate(val_json):
        reference = d['context'].split(' ')
        rank.append(99999)
        for i, didx in enumerate(d['dr_result']):
            hypothesis = json_docs[didx]['context'].split(' ')
            # if json_docs[didx]['context'] == d['context']:
            BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                [reference], hypothesis, weights=(0.5, 0.5))
            if BLEUscore > 0.9:
                rank[-1] = i + 1
                break
    recall5 = sum([1 for x in rank if x <= 5])
    recall1 = sum([1 for x in rank if x <= 1])
    print('DR R@1', recall1 / len(val_json), 'R@5', recall5 / len(val_json))

    # ------ SS ---------------------------------
    # Make sure to pass do_lower_case=False when using a multilingual-cased model.
    # See https://github.com/google-research/bert/blob/master/multilingual.md
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
    if os.path.exists(tmpdir + '/eval_ss.json'):
        with open(tmpdir + '/eval_ss.json', 'r') as f:
            val_json = json.load(f)
    else:
        sentence_selection(args, json_docs, val_json, tokenizer)
        with open(tmpdir + '/eval_ss.json', 'w') as f:
            json.dump(val_json, f)

    # Calculate recall
    rank = []
    for vidx, d in enumerate(val_json):
        reference = d['reference'].split(' ')
        for i, (_, sent) in enumerate(d['ss_result']):
            hypothesis = sent.split(' ')
            # if sent == d['reference']:
            BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                [reference], hypothesis, weights=(0.5, 0.5))
            if BLEUscore > 0.8:
                rank.append(i + 1)
                break
    recall5 = sum([1 for x in rank if x <= 5])
    recall1 = sum([1 for x in rank if x <= 1])
    print('SS R@1', recall1 / len(val_json), 'R@5', recall5 / len(val_json))

    # ------ RTE ------------------------------------
    if os.path.exists(tmpdir + '/eval_rte.pkl'):
        with open(tmpdir + '/eval_rte.pkl', 'rb') as f:
            val_json = pickle.load(f)
    else:
        rte(args, val_json, tokenizer)
        with open(tmpdir + '/eval_rte.pkl', 'wb') as f:
            pickle.dump(val_json, f)

    # Calculate accuracy
    name2label = {'TRUE': 0, 'FALSE': 1, 'NEI': 2}
    acc = []
    for vidx, d in enumerate(val_json):
        gt = name2label[d['True_False']]
        pred, norm = 0, 0
        if len(d['rte_result']) == 0:
            # No retrieved document in document retrieval
            acc.append(0)
            continue
        for rte_logit, sidx in d['rte_result']:
            pred += d['ss_result'][sidx][0] * rte_logit
            norm += d['ss_result'][sidx][0]
        pred = (pred / norm).argmax(0)
        acc.append(float(pred == gt))
    print('RTE Acc', sum(acc) / len(acc))
from transformers import BertTokenizer

# cat orig.jsonl | python scripts/preprocess_jsonl.py $(TARGET) > processed.tsv
"""
Extract the sentence, the topic, and the S-ID from a jsonl file and
output them as a tsv of the form: S-ID <tab> topic <tab> sentence.
"""

MAX_TOKEN_LENGTH = 192 - 2  # maximum length BERTKNP can handle
MAX_BYTE_SIZE = 4096        # maximum byte size of a sentence Juman++ can handle
JUMAN_COMMAND = '/mnt/violet/share/tool/juman++v2/bin/jumanpp'
BERTKNP_MODEL = '/mnt/berry/home/ueda/bertknp-0.2-20190901/pretrained_model'

jumanpp = Juman(command=JUMAN_COMMAND)
tokenizer = BertTokenizer.from_pretrained(BERTKNP_MODEL)


class Document(NamedTuple):
    did: str
    topic: str
    sentences: List[str]


def main():
    documents = []
    idx = 0
    for line in tqdm(sys.stdin.readlines()):
        input_obj = json.loads(line.strip())
        classes = [
            key for key, value in input_obj['classes'].items() if value == 1
early_stop = 20

train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
feature_cols = ['query', 'reply']
label_cols = ['label']

kf = KFold(n_splits=5)
res_proba = np.zeros((len(test_data), 2))
for tr_idx, val_idx in kf.split(train_data):
    train_x, train_y = train_data[feature_cols].loc[tr_idx], train_data[label_cols].loc[tr_idx]
    val_x, val_y = train_data[feature_cols].loc[val_idx], train_data[label_cols].loc[val_idx]

    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_encodings = tokenizer(train_x['query'].tolist(), train_x['reply'].tolist(),
                                truncation=True, padding=True, max_length=max_seq_len)
    val_encodings = tokenizer(val_x['query'].tolist(), val_x['reply'].tolist(),
                              truncation=True, padding=True, max_length=max_seq_len)
    test_encodings = tokenizer(test_data['query'].tolist(), test_data['reply'].tolist(),
                               truncation=True, padding=True, max_length=max_seq_len)
import pandas as pd
from transformers import BertTokenizer

from abstractive_summarizer import AbstractiveSummarization
from hyper_parameters import h_parms
from configuration import config

model = AbstractiveSummarization(num_layers=config.num_layers,
                                 d_model=config.d_model,
                                 num_heads=config.num_heads,
                                 dff=config.dff,
                                 vocab_size=config.input_vocab_size,
                                 output_seq_len=config.summ_length,
                                 rate=h_parms.dropout_rate)

tokenizer = BertTokenizer.from_pretrained(config.pretrained_bert_model)


def create_dataframe(path, num_examples):
    df = pd.read_csv(path)
    df.columns = [i.capitalize() for i in df.columns if i.lower() in ['document', 'summary']]
    assert len(df.columns) == 2, 'column names should be document and summary'
    df = df[:num_examples]
    assert not df.isnull().any().any(), 'dataset contains nans'
    return (df["Document"].values, df["Summary"].values)
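# Hedged usage sketch (the CSV path and example count below are hypothetical):
# documents, summaries = create_dataframe("data/summaries.csv", num_examples=1000)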
def load_tokenizer(args):
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})
    return tokenizer
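# Hedged usage sketch: ADDITIONAL_SPECIAL_TOKENS is defined elsewhere in the
# project (the entity markers below are only an illustrative assumption), and the
# model's embedding matrix must be resized after new tokens are added:
# ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"]
# tokenizer = load_tokenizer(args)
# model.resize_token_embeddings(len(tokenizer))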
"data_cache_dir": "/home/ubuntu/likun/huggingface_dataset", "train_size": 500, "val_size": 30, "test_size": 50, "max_length": 128, "shuffle": True, } pre_trained_model_name = 'bert-google-uncase-base' run_name = "zero-shot-metric-learning-benchmark-topic-medium-changelabel" # pre_trained_model_name = 'roberta-base' logger.critical("Build pre-trained model {}".format(pre_trained_model_name)) base_pre_trained_model_path = '/home/ubuntu/likun/nlp_pretrained/{}'.format( pre_trained_model_name) # trained_model_path = '/home/ubuntu/likun/nlp_save_kernels/zero-shot-metric-learning-benchmark-topic-small' # tokenizer = AutoTokenizer.from_pretrained(trained_model_path) tokenizer = BertTokenizer.from_pretrained(base_pre_trained_model_path) from datasets.features import ClassLabel from datasets.features import Features yahoo_zsl_path = '/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo' fea = Features({ "text": datasets.Value("string"), "label": ClassLabel(names_file=os.path.join(yahoo_zsl_path, 'classes.txt')) }) download_config = datasets.DownloadConfig() download_config.max_retries = 20 dataset = datasets.load_dataset('csv', data_files={
def get_tokenizer(self):
    return BertTokenizer.from_pretrained('bert-base-uncased', cache_dir=HF_CACHE_DIR)
def main():
    config = {
        'overwrite': True,
        'data_path': '../tcdata/nlp_round2_data',
        'data_cache_path': '../user_data/tmp_data/finetune_output/nezha_ngram_cv7_processed/data.pkl',
        'output_path': '../user_data/tmp_data/finetune_output/nezha_ngram_cv7_results',
        'model_path': '../user_data/tmp_data/pretrain_output/nezha_ngram_output/best_model_ckpt',
        'best_model_path': '',
        'batch_size': 64,        # 64
        'num_epochs': 3,         # 3
        'num_folds': 5,          # 7
        'cv': 'cv-',
        'max_seq_len': 32,
        'learning_rate': 2e-5,
        'eps': 0.1,
        'alpha': 0.3,
        'adv': 'fgm',
        'warmup_ratio': 0.1,
        'weight_decay': 0.01,
        'device': 'cuda:2',
        'logging_step': 500,     # 500
        'ema_start_step': 1500,  # 1500
        'ema_start': False,
        'seed': 20200409
    }

    if not torch.cuda.is_available():
        config['device'] = 'cpu'
    else:
        config['n_gpus'] = torch.cuda.device_count()
        config['batch_size'] *= config['n_gpus']

    if not os.path.exists(config['output_path']):
        os.makedirs(config['output_path'])

    tokenizer = BertTokenizer.from_pretrained(
        '../user_data/tmp_data/pretrain_output/nezha_ngram_output'
        '/nezha_ngram_tokenizer_and_config/vocab.txt')

    if not os.path.exists(config['data_cache_path']) or config['overwrite']:
        read_data(config, tokenizer, debug=False)
    collate_fn, test_dataloader, train_dev_data, eval_train_dataloader = load_data(config, tokenizer)

    # test_pred_df = pd.DataFrame(data={'id': range(len(test_dataloader.dataset) // 2),
    #                                   'fold1-probs': [0.0] * (len(test_dataloader.dataset) // 2),
    #                                   'fold1-logits0': [0.0] * (len(test_dataloader.dataset) // 2),
    #                                   'fold1-logits1': [0.0] * (len(test_dataloader.dataset) // 2)})
    # train_pred_df = pd.DataFrame(data={'id': range(len(train_dev_data['input_ids'])),
    #                                    'fold1-probs': [0.0] * len(train_dev_data['input_ids']),
    #                                    'fold1-logits0': [0.0] * len(train_dev_data['input_ids']),
    #                                    'fold1-logits1': [0.0] * len(train_dev_data['input_ids'])})

    fold = 0
    skf = StratifiedKFold(shuffle=True, n_splits=config['num_folds'], random_state=config['seed'])
    for train_idxs, dev_idxs in skf.split(X=train_dev_data['input_ids'], y=train_dev_data['labels']):
        fold += 1
        config['ema_start'] = False
        dev_dataloader, train_dataloader = load_cv_data(
            collate_fn, config, dev_idxs, train_dev_data, train_idxs, None, None)

        seed_everyone(config['seed'])
        if not config['best_model_path']:
            best_model_path = train(config, train_dataloader, dev_dataloader, fold)
        else:
            best_model_path = config['best_model_path']

        if best_model_path:
            print('\n>>> Loading best model ...')
            model = NeZhaForSequenceClassification.from_pretrained(best_model_path)
            model.to(config['device'])
            del model
            # train_pred_probs, train_pred_logits = predict(config, model, eval_train_dataloader, mode='valid')
            # train_pred_df.loc[:, f'fold{fold}-probs'] = train_pred_probs
            # train_pred_df.loc[:, f'fold{fold}-logits0'] = train_pred_logits[:, 0]
            # train_pred_df.loc[:, f'fold{fold}-logits1'] = train_pred_logits[:, 1]
            # test_pred_probs, test_pred_logits = predict(config, model, test_dataloader, mode='test')
            # test_pred_df.loc[:, f'fold{fold}-probs'] = test_pred_probs
            # test_pred_df.loc[:, f'fold{fold}-logits0'] = test_pred_logits[:, 0]
            # test_pred_df.loc[:, f'fold{fold}-logits1'] = test_pred_logits[:, 1]

        del train_dataloader, dev_dataloader
        gc.collect()
        torch.cuda.empty_cache()
if self.phase != "test": y = self.categories[idx] y = self.idx2onehot(y) X["y"] = torch.LongTensor(y) return X def pad(self, arr): return arr[:self.max_len] + [self.pad_token_id ] * (self.max_len - len(arr)) def idx2onehot(self, y): onehot = np.zeros(self.n_outputs) onehot[y] = 1 return onehot if __name__ == "__main__": from transformers import BertTokenizer from pprint import pprint _root = "data/corona_nlp" _phases = ["train", "test", "dev"] _tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") _max_len = 25 for _phase in _phases: dataset = CustomDataset(_root, _phase, _tokenizer, _max_len) for res in dataset: pprint(res) print(res['y'].size()) break
def train_ner_model(
    model_config_path,
    data_dir,
    logger_file_dir=None,
    labels_file=None
):
    # loading model config
    if os.path.exists(model_config_path):
        with open(model_config_path, "r", encoding="utf-8") as reader:
            text = reader.read()
        model_config_dict = json.loads(text)
    else:
        print("model_config_path doesn't exist.")
        sys.exit()

    if os.path.exists(model_config_dict["final_model_saving_dir"]):
        output_model_file = model_config_dict["final_model_saving_dir"] + "pytorch_model.bin"
        output_config_file = model_config_dict["final_model_saving_dir"] + "bert_config.json"
        output_vocab_file = model_config_dict["final_model_saving_dir"] + "vocab.txt"
    else:
        print("model_saving_dir doesn't exist.")
        sys.exit()

    if os.path.exists(logger_file_dir):
        logging.basicConfig(filename=logger_file_dir + "logs.txt", filemode="w")
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)
    else:
        print("logger_file_path doesn't exist.")
        sys.exit()

    if os.path.exists(labels_file):
        print("Labels file exists.")
    else:
        print("labels_file doesn't exist.")
        sys.exit()

    logger.info("Training configurations are given below ::")
    for key, val in model_config_dict.items():
        logger.info("{} == {}".format(key, val))
    logger.info("Started training model :::::::::::::::::::::")

    bert_config = BertConfig.from_json_file(model_config_dict["bert_config_path"])
    bert_tokenizer = BertTokenizer.from_pretrained(
        model_config_dict["bert_vocab_path"],
        config=bert_config,
        do_lower_case=model_config_dict["tokenizer_do_lower_case"]
    )
    # saving config and tokenizer
    bert_tokenizer.save_vocabulary(output_vocab_file)
    bert_config.to_json_file(output_config_file)

    labels = get_labels(labels_file)
    logger.info("Labels for NER are: {}".format(labels))
    label2idx = {l: i for i, l in enumerate(labels)}

    # preparing training data
    train_dataset = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="train",
        logger=logger
    )
    # preparing eval data
    eval_dataset = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="dev",
        logger=logger
    )
    logger.info("Training data and eval data loaded successfully.")

    if model_config_dict["model_type"] == "crf":
        model = BertCrfForNER.from_pretrained(
            model_config_dict["bert_model_path"],
            config=bert_config,
            pad_idx=bert_tokenizer.pad_token_id,
            sep_idx=bert_tokenizer.sep_token_id,
            num_labels=len(labels)
        )
        logger.info("{} model loaded successfully.".format(model_config_dict["model_type"]))

    # checking whether to finetune or freeze BERT's weights
    if model_config_dict["finetune"]:
        logger.info("Finetuning BERT.")
    else:
        for param in model.bert.parameters():
            param.requires_grad = False
        logger.info("Freezing BERT's weights.")

    # preparing optimizer and scheduler
    # NOTE: both groups below use weight_decay=0.0 as written, so the
    # no_decay split currently has no effect.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        }
    ]
    # total optimizer steps
    t_total = int((len(train_dataset) / model_config_dict["train_batch_size"])
                  * model_config_dict["num_epochs"])
    logger.info("t_total : {}".format(t_total))
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=model_config_dict["learning_rate"],
        eps=model_config_dict["epsilon"]
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=model_config_dict["warmup_steps"],
        num_training_steps=t_total
    )
    # log the parameter count (the original logged the function object itself)
    logger.info("{}".format(count_parameters(model)))
    model.to(DEVICE)

    best_eval_f1 = 0.0
    for epoch in range(model_config_dict["num_epochs"]):
        train_result = train_epoch(
            model=model,
            dataset=train_dataset,
            batch_size=model_config_dict["train_batch_size"],
            label_map=label2idx,
            max_grad_norm=model_config_dict["max_grad_norm"],
            optimizer=optimizer,
            scheduler=scheduler,
            device=DEVICE,
            sep_token_id=bert_tokenizer.sep_token_id
        )
        eval_result = eval_epoch(
            model=model,
            dataset=eval_dataset,
            batch_size=model_config_dict["validation_batch_size"],
            label_map=label2idx,
            device=DEVICE,
            sep_token_id=bert_tokenizer.sep_token_id,
            give_lists=False
        )
        print(f'Epoch: {epoch + 1}')
        print(f'Train Loss: {train_result["loss"]: .4f} | Train F1: {train_result["f1"]: .4f}')
        print(f'Eval Loss: {eval_result["loss"]: .4f} | Eval F1: {eval_result["f1"]: .4f}')
        logger.info(f'Epoch: {epoch + 1}')
        logger.info(f'Train Loss: {train_result["loss"]: .4f} | Train F1: {train_result["f1"]: .4f}')
        logger.info(f'Eval Loss: {eval_result["loss"]: .4f} | Eval F1: {eval_result["f1"]: .4f}')

        if best_eval_f1 < eval_result["f1"]:
            best_eval_f1 = eval_result["f1"]
            # saving model to disk
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save(model_to_save.state_dict(), output_model_file)
            print("Saved a better model.")
            logger.info("Saved a better model.")
            del model_to_save

    # loading the best model and computing test results
    model.load_state_dict(torch.load(output_model_file))
    logger.info("Loaded best model successfully.")
    test_dataset, test_examples, test_features = load_and_cache_examples(
        data_dir=data_dir,
        max_seq_length=model_config_dict["max_seq_length"],
        tokenizer=bert_tokenizer,
        label_map=label2idx,
        pad_token_label_id=label2idx["O"],
        mode="test",
        logger=logger,
        return_features_and_examples=True
    )
    logger.info("Test data loaded successfully.")
    test_label_predictions = predictions_from_model(
        model=model,
        tokenizer=bert_tokenizer,
        dataset=test_dataset,
        batch_size=model_config_dict["validation_batch_size"],
        label2idx=label2idx,
        device=DEVICE
    )
    # restructure test_label_predictions with real labels
    aligned_predicted_labels, true_labels = align_predicted_labels_with_original_sentence_tokens(
        test_label_predictions,
        test_examples,
        test_features,
        max_seq_length=model_config_dict["max_seq_length"],
        num_special_tokens=model_config_dict["num_special_tokens"]
    )
    print("Test Results classification report...")
    print(classification_report(true_labels, aligned_predicted_labels))
    return aligned_predicted_labels, true_labels
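# A hypothetical invocation of train_ner_model (all paths below are
# placeholders, not taken from the source). Note that the saving and logging
# directories are concatenated with file names directly, so trailing slashes
# matter:
if __name__ == "__main__":
    predicted_labels, gold_labels = train_ner_model(
        model_config_path="configs/ner_model_config.json",
        data_dir="data/ner/",
        logger_file_dir="logs/",
        labels_file="data/ner/labels.txt",
    )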
Note: I believe this model was trained on version 1 of SQuAD, since it's not
outputting whether the question is "impossible" to answer from the text (which
is part of the task in v2 of SQuAD).
"""

from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

"""Load the tokenizer as well.

Side note: Apparently the vocabulary of this model is identical to the one in
bert-base-uncased, so you can load the tokenizer from `bert-base-uncased` and
that works just as well.
"""

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

"""## 3. Ask a Question

Now we're ready to feed in an example! A QA example consists of a question and
a passage of text containing the answer to that question. Let's try an example
using the text in this tutorial!
"""

question = "How many parameters does BERT-large have?"
answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."

"""We'll need to run the BERT tokenizer against both the `question` and the
`answer_text`. To feed these into BERT, we concatenate them and place the
special [SEP] token in between."""
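"""As a quick sanity check (a minimal sketch, not part of the original code above), `tokenizer.encode` performs exactly this concatenation for us: it returns the IDs for [CLS], the question tokens, a [SEP], the passage tokens, and a final [SEP]."""

input_ids = tokenizer.encode(question, answer_text)
print('The input has a total of {:} tokens.'.format(len(input_ids)))

# Locate the first [SEP] to see where the question ends and the passage begins.
sep_index = input_ids.index(tokenizer.sep_token_id)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens[0], '...', tokens[sep_index], '...', tokens[-1])  # [CLS] ... [SEP] ... [SEP]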
    os.path.join(
        args.data_path,
        "train_toy.csv" if args.toy in ["True", "toy"] else "train.csv"))
test_df = pd.read_csv(
    os.path.join(
        args.data_path,
        "test_toy.csv" if args.toy in ["True", "toy"] else "test.csv"))
submission = pd.read_csv(
    os.path.join(
        args.data_path,
        "sample_submission_toy.csv" if args.toy in ["True", "toy"] else "sample_submission.csv",
    ))

tokenizer = BertTokenizer.from_pretrained(
    args.bert_model, do_lower_case=("uncased" in args.bert_model))

test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(test_set.lengths,
                                   batch_size=args.batch_size,
                                   maxlen=args.max_sequence_length),
    collate_fn=make_collate_fn(),
)

for fold, train_set, valid_set, train_fold_df, val_fold_df in cross_validation_split(
        args, train_df, tokenizer):
    print()
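# `BucketingSampler` and `make_collate_fn` are project code not shown here.
# A minimal sketch of the bucketing idea, under the assumption that it groups
# examples of similar length into the same batch to minimise padding (the real
# implementation may differ):
import random
from torch.utils.data import Sampler

class SimpleBucketingSampler(Sampler):
    """Yields batches of indices sorted by (capped) sequence length."""

    def __init__(self, lengths, batch_size, maxlen=500):
        order = sorted(range(len(lengths)), key=lambda i: min(lengths[i], maxlen))
        self.batches = [order[i:i + batch_size]
                        for i in range(0, len(order), batch_size)]
        random.shuffle(self.batches)  # shuffle batch order, keep buckets intact

    def __iter__(self):
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)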
def load_model_and_tokenizer():
    global model_path
    model = load_model(model_path).cpu()
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    # register '[BLANK]' as an additional special token so it is never split
    tokenizer.add_special_tokens({"additional_special_tokens": ['[BLANK]']})
    return model, tokenizer
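# Assumption about downstream use (not shown in the source): adding '[BLANK]'
# grows the tokenizer's vocabulary by one, so if `model` wraps a Hugging Face
# transformer its embedding matrix must be resized before the new token id can
# be embedded. A minimal sketch:
model, tokenizer = load_model_and_tokenizer()
if hasattr(model, "resize_token_embeddings"):
    model.resize_token_embeddings(len(tokenizer))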