def main():
    logging.set_verbosity_info()

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='best_model_ckpt_0', type=str)
    parser.add_argument('--seed', default=202105, type=int)
    args = parser.parse_args()

    seed_random(args.seed)

    data_path = './user_data/duality_pair_pretrain_no_nsp.txt'
    vocab_path = './user_data/vocab.txt'
    model_path = './user_data/nezha-cn-base'
    output_path = './user_data/pretrained-nezha-base'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    data = read_data(data_path, tokenizer)
    train_dataset = TcDataset(data)

    model = NeZhaForMaskedLM.from_pretrained(model_path)
    model.resize_token_embeddings(tokenizer.vocab_size)

    data_collator = TcCollator(max_seq_len=30, tokenizer=tokenizer, mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, args.model_path)
    tokenizer_and_config = os.path.join(output_path, 'tokenizer_and_config')
    build_path(model_save_path)
    build_path(logging_path)
    build_path(tokenizer_and_config)

    training_args = TrainingArguments(
        output_dir=output_path,
        overwrite_output_dir=True,
        learning_rate=6e-5,
        num_train_epochs=130,
        per_device_train_batch_size=128,
        logging_steps=5000,
        fp16=True,
        fp16_backend='amp',
        load_best_model_at_end=True,
        prediction_loss_only=True,
        logging_dir=logging_path,
        logging_first_step=True,
        dataloader_num_workers=4,
        seed=2021
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(tokenizer_and_config)
def main():
    seed_everything(2021)
    logging.set_verbosity_info()

    corpus_path = './user_data/r2_corpus.txt'
    vocab_path = './user_data/r2_vocab_total.txt'
    model_path = './user_data/hfl-roberta-base'
    output_dir = './user_data/self-pretrained-bert-base-r2'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    data = read_data(corpus_path, tokenizer)
    train_dataset = OppoDataset(data)

    model = BertForMaskedLM.from_pretrained(model_path)
    model.resize_token_embeddings(tokenizer.vocab_size)

    data_collator = Collator(max_seq_len=32, tokenizer=tokenizer, mlm_probability=0.15)

    logging_dir = os.path.join(output_dir, 'log')
    model_save_dir = os.path.join(output_dir, 'model_ckpt-1')
    tokenizer_and_config = os.path.join(output_dir, 'tokenizer_and_config')
    check_dir(model_save_dir)
    check_dir(logging_dir)
    check_dir(tokenizer_and_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        learning_rate=6e-5,
        num_train_epochs=130,
        # num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2000,
        prediction_loss_only=True,
        load_best_model_at_end=True,
        logging_dir=logging_dir,
        logging_first_step=True,
        dataloader_num_workers=4,
        disable_tqdm=False,
        seed=2021
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_dir)
    tokenizer.save_pretrained(tokenizer_and_config)
def test_set_level(self):
    logger = logging.get_logger()

    # the current default level is logging.WARNING
    level_origin = logging.get_verbosity()

    logging.set_verbosity_error()
    self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

    logging.set_verbosity_warning()
    self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

    logging.set_verbosity_info()
    self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

    logging.set_verbosity_debug()
    self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

    # restore to the original level
    logging.set_verbosity(level_origin)
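# A minimal, self-contained sketch of the verbosity API exercised by the test
# above (assumes only that the `transformers` package is installed): the
# module-level setters change the effective level of the library's root logger.
from transformers.utils import logging

logging.set_verbosity_info()                 # show INFO and above
logger = logging.get_logger("transformers")  # library root logger
logger.info("verbosity is now %d", logging.get_verbosity())
logging.set_verbosity_warning()              # quieten back down to WARNING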
def evaluate_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    do_inference: bool,
    eval_model_file: str,
    test_file: str,
    test_sample_rate: float,
    out_dir: str,
    dataset_type: str,
    enable_bottleneck: bool = False,
    cluster_model_file: str = None,
    clustering_dist_threshold: float = 0.18,
    style_model_eval: bool = False,
    detokenize_after: bool = False,
    tokenize_after: bool = False
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    out_path_prefix = os.path.join(out_dir, eval_model_file[eval_model_file.index('checkpoint'):])
    if out_path_prefix[-1] == '/':
        out_path_prefix = out_path_prefix[:-1]
    out_path_prefix += '-'

    if do_inference == '1':
        make_inference_and_save(
            config_file, eval_model_file, test_file, test_sample_rate,
            enable_bottleneck, cluster_model_file, clustering_dist_threshold,
            out_path_prefix, dataset_type, style_model_eval
        )

    evaluate_and_print_metrics(
        out_path_prefix + 'prediction.txt',
        out_path_prefix + 'gold.txt',
        detokenize_after=detokenize_after,
        tokenize_after=tokenize_after,
        is_multiple_ref=(cluster_model_file is not None),
        lower=True,
        are_clusters_used=(cluster_model_file is not None)
    )
def __init__(
    self,
    pretrained_model_name_or_path: str,
    log_info: bool = False,
    use_gpu: bool = False,
    do_lower_case: bool = False,
    do_basic_tokenize: bool = True,
    strip_accents: bool = True
):
    if log_info:
        logging.set_verbosity_info()

    self.tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        strip_accents=strip_accents
    )
    self.model = AutoModel.from_pretrained(pretrained_model_name_or_path)

    self.use_gpu = use_gpu and torch.cuda.is_available()
    if self.use_gpu:
        self.model.cuda()
    self.model.eval()
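# The same loading pattern as a standalone, runnable sketch (the model name
# "bert-base-cased" is an assumption for illustration): INFO-level logging is
# enabled before from_pretrained so download/initialization messages are visible.
import torch
from transformers import AutoModel, AutoTokenizer
from transformers.utils import logging

logging.set_verbosity_info()
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)
model = AutoModel.from_pretrained("bert-base-cased")
model = model.cuda() if torch.cuda.is_available() else model
model.eval()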
from transformers import (
    ForCondGen,
    XLMForCondGen,
    logging,
)
from transformers_old.modeling_prophetnet import (
    ForCondGen as ForCondGenOld,
)
from transformers_old.modeling_xlm_prophetnet import (
    XLMForCondGen as XLMForCondGenOld,
)

logger = logging.get_logger(__name__)
logging.set_verbosity_info()


def to_pytorch(src_path, save_path):
    if "xprophetnet" in src_path:
        prophet_old = XLMForCondGenOld.from_pretrained(src_path)
        prophet, loading_info = XLMForCondGen.from_pretrained(src_path, output_loading_info=True)
    else:
        prophet_old = ForCondGenOld.from_pretrained(src_path)
        prophet, loading_info = ForCondGen.from_pretrained(src_path, output_loading_info=True)

    special_keys = ["key_proj", "value_proj", "query_proj"]

    mapping = {
        "self_attn": "ngram_self_attn",
        "cross_attn": "encoder_attn",
        "cross_attn_layer_norm": "encoder_attn_layer_norm",
        "feed_forward_layer_norm": "final_layer_norm",
    lmap,
    pickle_save,
    save_git_info,
    save_json,
    set_extra_model_params,
    Seq2SeqDataset,
)

# need the parent dir module
sys.path.insert(2, str(Path(__file__).resolve().parents[1]))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
transformers_logging.set_verbosity_info()


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


class GenerativeQAModule(BaseTransformer):
    mode = "generative_qa"
    loss_names = ["loss"]
    metric_names = ["em"]
    val_metric = "em"

    def __init__(self, hparams, **kwargs):
def train_gen_title(
    run_name: str,
    config_file: str,
    train_file: str,
    val_file: str,
    dataset_type: str,
    train_sample_rate: float,
    val_sample_rate: float,
    output_model_path: str,
    enable_bottleneck: bool = False,
    from_pretrained: str = None,
    checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel

    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = cls.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    model.cuda()

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        train_records = [r for r in tqdm.tqdm(ria_reader(train_file)) if random.random() <= train_sample_rate]
        val_records = [r for r in tqdm.tqdm(ria_reader(val_file)) if random.random() <= val_sample_rate]

        print("Building datasets...")
        train_dataset = GenTitleDataset(
            train_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(
            val_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'tg':
        print("Fetching TG data...")
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file)) if random.random() <= train_sample_rate]

        print("Building datasets...")
        full_dataset = GenTitleDataset(
            all_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        train_size = int(0.995 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'lenta-ria':
        print('Fetching Lenta-RIA data...')
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        records = [r for r in reader('/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')]

        filter_lenta = [
            {'text': r['lenta_text'], 'title': r['lenta_title'], 'agency': 'lenta.ru', 'date': r['lenta_date']}
            for r in records
        ]
        filter_ria = [
            {'text': r['ria_text'], 'title': r['ria_title'], 'agency': 'РИА Новости', 'date': r['lenta_date']}
            for r in records
        ]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [r for r in lenta_records if r['title'] not in lenta_filter_titles]
        ria_records = [r for r in ria_records if r['title'] not in ria_filter_titles]

        random.shuffle(ria_records)
        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:300000]

        random.shuffle(all_records)

        print("Building datasets...")
        full_dataset = GenTitleDataset(
            all_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        train_size = int(0.99 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'clusters':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{'title': x['lenta_title'], 'text': x['lenta_text']} for x in records]
        ria_records = [{'title': x['ria_title'], 'text': x['ria_text']} for x in records]

        n1 = int(0.98 * len(lenta_records))
        n2 = int(0.98 * len(ria_records))
        train_records = lenta_records[:n1] + ria_records[:n2]
        val_records = lenta_records[n1:] + ria_records[n2:]

        train_dataset = GenTitleDataset(
            train_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(
            val_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-ria':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        ria_records = [{'title': x['ria_title'], 'text': x['ria_text']} for x in records]

        train_records = ria_records[:int(0.97 * len(ria_records))]
        val_records = ria_records[int(0.97 * len(ria_records)):]

        train_dataset = GenTitleDataset(
            train_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(
            val_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-lenta':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{'title': x['lenta_title'], 'text': x['lenta_text']} for x in records]

        train_records = lenta_records[:int(0.97 * len(lenta_records))]
        val_records = lenta_records[int(0.97 * len(lenta_records)):]

        train_dataset = GenTitleDataset(
            train_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(
            val_records, tokenizer,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset)
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
def train_gen_title(
    config_file: str,
    train_file: str,
    val_file: str,
    train_sample_rate: float,
    val_sample_rate: float,
    output_model_path: str,
    enable_bottleneck: bool = False,
    from_pretrained: str = None,
    checkpoint: str = None
):
    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)

    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")
    logging.set_verbosity_info()

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [r for r in read_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in read_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    print("Building datasets...")
    model_path = config.pop("model_path")
    tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)

    max_tokens_text = config.pop("max_tokens_text", 196)
    max_tokens_title = config.pop("max_tokens_title", 48)

    train_dataset = GenTitleDataset(
        train_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)
    val_dataset = GenTitleDataset(
        val_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        model = cls.from_encoder_decoder_pretrained(model_path, model_path)

    print("Training model...")
    batch_size = config.pop("batch_size", 8)
    eval_steps = config.pop("eval_steps", 10000)
    save_steps = config.pop("save_steps", 10000)
    logging_steps = config.pop("logging_steps", 100)
    learning_rate = config.pop("learning_rate", 5e-05)
    warmup_steps = config.pop("warmup_steps", 2000)
    num_train_epochs = config.pop("num_train_epochs", 5)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=1,
        num_train_epochs=num_train_epochs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
def train_style_gen_title(
    run_name: str,
    config_file: str,
    train_file: str,
    dataset_type: str,
    output_model_path: str,
    from_pretrained: str = None,
    checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)
        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:220000]

        random.shuffle(all_records)

    print("Building datasets...")
    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
parser.add_argument(
    "--eval_steps",
    type=int,
    default=500,
    help="Number of update steps between two evaluations",
)
parser.add_argument(
    "--do_lowercase",
    action="store_true",
    help="If input should be lowercase or not when tokenizing",
)

args = parser.parse_args()

hf_logging.enable_default_handler()
hf_logging.set_verbosity_info()
hf_logging.enable_explicit_format()

# Setup logging
tb_writer = SummaryWriter(log_dir=args.logging_dir)

logger = logging.getLogger("")
logger.setLevel(logging.INFO)
fh = logging.FileHandler(f"{args.logging_dir}.log")
sh = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(
    "[%(asctime)s], %(levelname)s %(message)s",
    datefmt="%a, %d %b %Y %H:%M:%S",
)
fh.setFormatter(formatter)
def train_gen_title(
    run_name: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    output_model_path: str,
    from_pretrained: str = None,
    checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text, max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))
    train_dataset, val_dataset = \
        torch.utils.data.random_split(full_dataset, [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    hf_logging.set_verbosity_info()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        logger.info(
            f"Output dir ({training_args.output_dir}) is not empty, will try to reload from there."
        )
        model_args.model_name_or_path = training_args.output_dir
        # raise ValueError(
        #     f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        # )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            "This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logger.info(model)
    num_params = sum(p.numel() for p in model.parameters())
    logger.info('Model has %d parameters' % num_params)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info('Model has %d trainable parameters' % num_params)

    # ADD special tokens
    tokenizer.pad_token = tokenizer.eos_token
    special_tokens_dict = {
        'additional_special_tokens': ['<STORY>', '<QUERY>', '<PROOF>', '<ANSWER>']
    }
    # NOTE: should also have added "ent_1", "ent_2", ..., "ent_20" :/
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    logger.info(f'We have added {num_added_toks} tokens')
    '''
    if tokenizer.pad_token_id is None and data_args.line_by_line:
        # See PR 3388. Some tokenizers don't have pad tokens, which causes errors at the encoding step in the collate_fn.
        # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
        # when feeding to the model.
        # tokenizer.pad_token = tokenizer.eos_token
        num_added_toks = tokenizer.add_special_tokens({"pad_token": "<pad>"})
    '''
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.model_max_length
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
        if training_args.do_train else None)
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if (training_args.do_eval or training_args.evaluate_during_training) else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # start by saving tokenizer so that we can restart training!
    # if trainer.is_world_master():
    #     tokenizer.save_pretrained(training_args.output_dir)

    results = {}

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None)
        logger.info(f"model_path: {model_path}")

        if model_path is not None:
            # Grab the most recent checkpoint
            checkpoints_sorted = trainer._sorted_checkpoints(use_mtime=True)
            assert len(checkpoints_sorted) > 0
            checkpoint_most_recent = checkpoints_sorted[-1]
            logger.info(f"most recent checkpoint: {checkpoint_most_recent}. setting model_path to this.")
            # TODO: find a way to set:
            # - patience_best_eval_loss = None
            # - patience_evals_without_improvement = 0
            # - patience_should_stop = False
            model_path = checkpoint_most_recent

        train_results = trainer.train(model_path=model_path)
        results["train_step"] = train_results.global_step
        results["train_loss"] = train_results.training_loss
        results["train_ppl"] = math.exp(train_results.training_loss)

        # trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        # if trainer.is_world_master():
        #     tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        results["valid_loss"] = eval_output["eval_loss"]
        results["valid_ppl"] = math.exp(eval_output["eval_loss"])

        output_eval_file = os.path.join(training_args.output_dir, "results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** results *****")
                for key in sorted(results.keys()):
                    logger.info("  %s = %s", key, str(results[key]))
                    writer.write("%s = %s\n" % (key, str(results[key])))

    return results
def train_discriminator(
    run_name: str,
    model_path: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    dataset_type: str,
    output_model_path: str,
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file, agency_list))]
        full_dataset = AgencyTitleDatasetClassification(
            all_records, tokenizer, agency_list,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        records = [r for r in reader('/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')]

        filter_lenta = [
            {'text': r['lenta_text'], 'title': r['lenta_title'], 'agency': 'lenta.ru', 'date': r['lenta_date']}
            for r in records
        ]
        filter_ria = [
            {'text': r['ria_text'], 'title': r['ria_title'], 'agency': 'РИА Новости', 'date': r['lenta_date']}
            for r in records
        ]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [r for r in lenta_records if r['title'] not in lenta_filter_titles]
        ria_records = [r for r in ria_records if r['title'] not in ria_filter_titles]

        random.shuffle(ria_records)
        lenta_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']]

        all_records = lenta_records + ria_records[:len(lenta_records)]
        random.shuffle(all_records)

        full_dataset = AgencyTitleDatasetClassification(
            all_records, tokenizer, agency_list,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria-clusters':
        full_dataset = LentaRiaDatasetClassification(train_file, tokenizer, agency_list, max_tokens_text, max_tokens_title)

    print("Building datasets...")
    train_size = int(train_fraq * len(full_dataset))
    test_size = int((1 - train_fraq) * 0.5 * len(full_dataset))

    train_dataset, test_dataset, eval_dataset = \
        torch.utils.data.random_split(
            full_dataset,
            [train_size, test_size, len(full_dataset) - train_size - test_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(eval_dataset),
        'Test dataset size': len(test_dataset),
    })

    print("Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(agency_list))

    print("Training model...")
    batch_size = config["batch_size"]
    logging_steps = config["logging_steps"]
    save_steps = config["save_steps"]
    eval_steps = config["eval_steps"]
    warmup_steps = config["num_warmup_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]
    max_steps = config["max_steps"]
    lr = config["learning_rate"]

    training_args = TrainingArguments(
        output_dir=output_model_path,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        learning_rate=lr,
        warmup_steps=warmup_steps,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        save_total_limit=1,
        weight_decay=0.01,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    wandb.summary.update({'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)})
    model.save_pretrained(output_model_path)
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)
    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(
        discr_model_file, num_labels=len(agency_list)).cuda()

    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) if random.random() <= test_sample_rate]

    print("Building datasets...")
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }
    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([
            agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True)
            for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(
                title,
                add_special_tokens=True,
                max_length=max_tokens_title,
                padding='max_length',
                truncation=True
            )
            logits = discriminator(
                input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0),
                attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
def perform_clustering_eval(
    existing_run_name: str,
    existing_run_id: str,
    config_file,
    eval_model_file,
    clustering_data_file,
    gold_markup_file,
    enable_bottleneck,
    text_to_vec_func
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)
    max_tokens_text = config["max_tokens_text"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    gold_markup = get_gold_markup(gold_markup_file)

    url2record, filename2url = get_data_to_cluster(clustering_data_file)
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    text_to_vector_func = get_text_to_vector_func(text_to_vec_func, model, tokenizer)

    print('Calculating embeddings...')
    embeds = np.zeros((len(url2record.items()), 768))
    total_articles = len(url2record.items())
    for i, (url, record) in tqdm.tqdm(enumerate(url2record.items()), total=total_articles):
        text = record["title"] + ' ' + record["text"]
        text = text.lower().replace('\xa0', ' ').strip()
        embeds[i] = text_to_vector_func(text).detach().cpu().numpy().ravel()

    print('Embeds shape =', embeds.shape)

    print('Searching for optimal threshold')
    domain = np.logspace(-3, 0, 11)
    quals = [get_quality(embeds, gold_markup, url2record, dist) for dist in tqdm.tqdm(domain, total=11)]

    closer_domain = np.linspace(
        domain[max(0, np.argmax(quals) - 2)],
        domain[min(np.argmax(quals) + 3, len(domain) - 1)],
        9)
    closer_quals = [get_quality(embeds, gold_markup, url2record, dist) for dist in tqdm.tqdm(closer_domain, total=9)]

    best_dist = closer_domain[np.argmax(closer_quals)]
    print('Best distance:', best_dist)

    get_quality(embeds, gold_markup, url2record, best_dist, print_result=True)
    log_to_wandb(embeds, gold_markup, best_dist, url2record, text_to_vec_func)