def load_train_model():
    bert_model = '../models/checkpoint'
    cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                             'distributed_{}'.format(-1))
    processor = processors['cwspku']()
    label_list = processor.get_labels()
    model = ZenForTokenClassification.from_pretrained(bert_model,
                                                      cache_dir=cache_dir,
                                                      num_labels=len(label_list) + 1,
                                                      multift=False)
    # model.load_state_dict(torch.load('./results/result-tokenlevel-2020-03-31-15-52-51/checkpoint-27/pytorch_model.bin'))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    ngram_dict = ZenNgramDict(bert_model, tokenizer=tokenizer)
    params = model, tokenizer, ngram_dict, label_list
    return params
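
# --- Added illustration (not in the original source) ---
# A minimal sketch of how the tuple returned by load_train_model() could be
# consumed. The id-to-label offset of 1 is an assumption based on
# num_labels = len(label_list) + 1 above (id 0 presumably reserved for padding).
def _demo_unpack_params():
    model, tokenizer, ngram_dict, label_list = load_train_model()
    # Map prediction ids back to label strings for decoding token-level output.
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    return model, tokenizer, ngram_dict, label_map
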
def main():
    parser = argparse.ArgumentParser()
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='./results/result-tokenlevel-{}'.format(now_time),
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--multift",
                        action='store_true',
                        help="True for multi-task fine-tuning")
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X update steps.")
    args = parser.parse_args()

    args.task_name = args.task_name.lower()

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        filemode='w',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    if args.local_rank == -1 or args.no_cuda:
        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Set seed
    set_seed(args)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        print("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    # Prepare model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    ngram_dict = ZenNgramDict(args.bert_model, tokenizer=tokenizer)
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = ZenForTokenClassification.from_pretrained(args.bert_model,
                                                      cache_dir=cache_dir,
                                                      num_labels=num_labels,
                                                      multift=args.multift)
    model.to(args.device)

    if args.do_train:
        train(args, model, tokenizer, ngram_dict, processor, label_list)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        result = evaluate(args, model, tokenizer, ngram_dict, processor, label_list)
        logger.info("\nf1=%s\n" % (str(result["f1"])))
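
# --- Added illustration (not in the original source) ---
# Example invocation of this fine-tuning script. The script filename and data
# paths below are placeholders; `cwspku` is the processor key used elsewhere in
# this repository, and every flag shown is defined in main() above.
#
#   python finetune_token_level.py \
#       --data_dir ./data/cws_pku \
#       --bert_model ../models/checkpoint \
#       --task_name cwspku \
#       --do_train --do_eval \
#       --max_seq_length 128 \
#       --train_batch_size 32
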
"masked_lm_labels": masked_lm_labels, } instances.append(instance) # recover current_chunk = [] current_length = 0 i += 1 return instances if __name__ == '__main__': tokenizer = BertTokenizer.from_pretrained(CORPUS_PATH, do_lower_case=False) vocab_list = list(tokenizer.vocab.keys()) ngram_dict = ZenNgramDict(CORPUS_PATH, tokenizer=tokenizer) # 读取以空格分割的文档 with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: with open(CORPUS_PATH / 'corpus_256.txt', 'r', encoding='utf-8') as f: doc = [] for line in tqdm(f, desc="Loading Dataset", unit="lines"): line = line.strip() if line == "": docs.add_document(doc) doc = [] else: tokens = tokenizer.tokenize(line) doc.append(tokens) if doc: docs.add_document( doc
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model",
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask",
                        action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory",
                        action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--epochs_to_generate",
                        type=int,
                        default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob",
                        type=float,
                        default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob",
                        type=float,
                        default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq",
                        type=int,
                        default=20,
                        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument("--ngram_list", type=str, default="/data/zhwiki/ngram.txt")
    parser.add_argument("--max_ngram_in_sequence", type=int, default=20)
    args = parser.parse_args()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    ngram_dict = ZenNgramDict(args.bert_model, tokenizer=tokenizer)

    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)
        for epoch in trange(args.epochs_to_generate, desc="Epoch"):
            epoch_filename = args.output_dir / f"epoch_{epoch}.json"
            num_instances = 0
            with epoch_filename.open('w') as epoch_file:
                for doc_idx in trange(len(docs), desc="Document"):
                    doc_instances = create_instances_from_document(
                        docs,
                        doc_idx,
                        max_seq_length=args.max_seq_len,
                        short_seq_prob=args.short_seq_prob,
                        masked_lm_prob=args.masked_lm_prob,
                        max_predictions_per_seq=args.max_predictions_per_seq,
                        whole_word_mask=args.do_whole_word_mask,
                        vocab_list=vocab_list,
                        ngram_dict=ngram_dict)
                    doc_instances = [json.dumps(instance) for instance in doc_instances]
                    for instance in doc_instances:
                        epoch_file.write(instance + '\n')
                        num_instances += 1
            metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json"
            with metrics_file.open('w') as metrics_file:
                metrics = {
                    "num_training_examples": num_instances,
                    "max_seq_len": args.max_seq_len,
                    "max_ngram_in_sequence": args.max_ngram_in_sequence
                }
                metrics_file.write(json.dumps(metrics))
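
# --- Added illustration (not in the original source) ---
# Typical command line for this pregeneration step; the script filename and the
# corpus/output paths are placeholders, and every flag shown is defined in
# main() above.
#
#   python pregenerate_training_data.py \
#       --train_corpus /data/zhwiki/corpus.txt \
#       --bert_model bert-base-chinese \
#       --output_dir ./pregenerated_data \
#       --epochs_to_generate 3 \
#       --max_seq_len 128 \
#       --ngram_list /data/zhwiki/ngram.txt
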