def __init__(self, db_session, config_path: str):
    self.db_session = db_session
    self.config = json.loads(jsonnet_evaluate_file(config_path))
    self.processors = dict()
    for key, item in self.config["processors"].items():
        item_type = item.pop("type")
        self.processors[key] = Processor.by_name(item_type)(**item)
        print("'{}' processor loaded".format(key))
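# A minimal sketch of the evaluated config shape this constructor expects.
# The field names "processors" and "type" come from the code above; the
# concrete processor names and kwargs below are hypothetical.
example_config = {
    "processors": {
        "cleaner": {"type": "cleaner", "min_text_len": 30},
        "lang_detector": {"type": "lang_detector", "model_path": "models/lang.ftz"},
    }
}
# After `item.pop("type")`, the remaining keys are passed to the registered
# Processor subclass as constructor keyword arguments.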
def __init__(self, config_path):
    self.config = json.loads(jsonnet_evaluate_file(config_path))
    self.lang_detect_model_path = self.config["lang_detect_model_path"]
    self.cat_detect_model_path = self.config["cat_detect_model_path"]
    self.max_tokens = self.config.get("max_tokens")
    self.is_lower = self.config["is_lower"]
    self.languages = self.config.get("languages", ["ru", "en"])
    self.is_news_only = self.config.get("is_news_only", False)

    assert os.path.exists(self.lang_detect_model_path), "No language detection model found"
    assert os.path.exists(self.cat_detect_model_path), "No category detection model found"
    self.lang_detect_model = ft_load_model(self.lang_detect_model_path)
    self.cat_detect_model = ft_load_model(self.cat_detect_model_path)
    self.tokenizer = Tokenizer("conservative", joiner_annotate=False)
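# Hedged usage sketch: how the loaded fastText models are typically queried.
# `predict` returns parallel tuples of labels and probabilities, and label
# strings carry a "__label__" prefix; newlines must be stripped first.
def detect_language(lang_detect_model, text: str) -> str:
    labels, _ = lang_detect_model.predict(text.replace("\n", " "), k=1)
    return labels[0].replace("__label__", "")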
def __init__(self, db_session, config_path: str):
    self.config = json.loads(jsonnet_evaluate_file(config_path))
    self.db_session = db_session

    self.vectors = None
    self.num2doc = list()
    self.num2entities = list()
    self.num2keywords = list()
    self.num2host = list()
    self.num2timestamp = list()
    self.doc_count = 0
    self.id2num = dict()
    self.keyword2nums = defaultdict(list)
    self.distances = None
    self.labels = dict()
    self.clusters = defaultdict(list)
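# Hypothetical companion sketch (not in the original source) showing how the
# indices above fit together when a document is registered; the field names
# "id", "host", "timestamp", "entities", and "keywords" are assumptions.
def add_document(self, doc: dict) -> None:
    num = self.doc_count
    self.id2num[doc["id"]] = num
    self.num2doc.append(doc)
    self.num2host.append(doc.get("host"))
    self.num2timestamp.append(doc.get("timestamp"))
    self.num2entities.append(doc.get("entities", []))
    self.num2keywords.append(doc.get("keywords", []))
    for keyword in doc.get("keywords", []):
        self.keyword2nums[keyword].append(num)
    self.doc_count += 1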
def train_tfidf(config_file, input_file, output_file, svd_matrix_file):
    config = json.loads(jsonnet_evaluate_file(config_file))
    input_file = get_true_file(input_file)
    assert input_file.endswith(".jsonl")

    print("Parsing input data...")
    corpus = []
    for record in tqdm(read_tg_jsonl(input_file)):
        corpus.append(record.pop("title") + " " + record.pop("text"))

    idfs = build_idf_vocabulary(corpus, **config.pop("building"))

    print("Saving vocabulary with IDFs...")
    with open(output_file, "w") as w:
        for word, idf in idfs:
            w.write("{}\t{}\n".format(word, idf))

    word2idf = {word: idf for word, idf in idfs}
    word2idx = {word: idx for idx, (word, _) in enumerate(idfs)}

    print("Preparing CSR matrix...")
    X_data = []
    X_col_ind = []
    X_row_ind = []
    for i, text in enumerate(corpus):
        data, col_ind = get_tfidf_vector(text, word2idf, word2idx)
        row_ind = [i for _ in range(len(col_ind))]
        X_data += data
        X_col_ind += col_ind
        X_row_ind += row_ind
    X = csr_matrix((X_data, (X_row_ind, X_col_ind)))

    print("Calculating truncated SVD...")
    svd_dim = config.pop("svd_dim")
    svd = TruncatedSVD(n_components=svd_dim, n_iter=100, random_state=42)
    svd.fit(X)
    matrix = svd.components_.T

    model = SVDEmbedder(len(word2idf), svd_dim)
    model.mapping_layer.weight.data = torch.DoubleTensor(matrix).transpose(0, 1)
    torch.save(model, svd_matrix_file)
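# Hedged sketch of the get_tfidf_vector helper assumed above: raw term
# frequency times IDF for each in-vocabulary token. The real helper may
# normalize or tokenize differently.
from collections import Counter

def get_tfidf_vector_sketch(text, word2idf, word2idx):
    data, col_ind = [], []
    for word, tf in Counter(text.split()).items():
        if word in word2idx:
            data.append(tf * word2idf[word])
            col_ind.append(word2idx[word])
    return data, col_ind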
import json

import wandb

# In[ ]:

from _jsonnet import evaluate_file as jsonnet_evaluate_file
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments, logging

from readers import tg_reader, lenta_reader, ria_reader
from custom_datasets import FullStyleDataset
from utils.training_utils import get_separate_lr_optimizer, init_wandb

# In[ ]:

logging.set_verbosity_info()
config = json.loads(jsonnet_evaluate_file('/home/aobuhtijarov/master-thesis/configs/gen_title.jsonnet'))

init_wandb('full-style', config)

agency_list = ["РИА Новости", "lenta.ru"]
print('Agency list:', agency_list)

# In[ ]:

tokenizer_model_path = config["tokenizer_model_path"]
tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

max_tokens_text = config["max_tokens_text"]
max_tokens_title = config["max_tokens_title"]
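# Hedged sketch of utils.training_utils.init_wandb as these scripts use it
# (a run name plus the evaluated config, optionally resuming an existing run);
# the actual helper and its signature may differ.
def init_wandb_sketch(run_name, config, run_id=None):
    wandb.init(name=run_name, config=config, id=run_id,
               resume="must" if run_id else None)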
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    val_file: str,
                    dataset_type: str,
                    train_sample_rate: float,
                    val_sample_rate: float,
                    output_model_path: str,
                    enable_bottleneck: bool = False,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)
    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = cls.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)
    model.cuda()

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        train_records = [r for r in tqdm.tqdm(ria_reader(train_file)) if random.random() <= train_sample_rate]
        val_records = [r for r in tqdm.tqdm(ria_reader(val_file)) if random.random() <= val_sample_rate]

        print("Building datasets...")
        train_dataset = GenTitleDataset(train_records, tokenizer,
                                        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(val_records, tokenizer,
                                      max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'tg':
        print("Fetching TG data...")
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file)) if random.random() <= train_sample_rate]

        print("Building datasets...")
        full_dataset = GenTitleDataset(all_records, tokenizer,
                                       max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        train_size = int(0.995 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'lenta-ria':
        print('Fetching Lenta-RIA data...')
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))])

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))])

        # Exclude the paired test records from training data; `reader` is
        # assumed to be a JSONL reader over the paired Lenta-RIA test set.
        records = [r for r in reader('/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')]
        filter_lenta = [{'text': r['lenta_text'], 'title': r['lenta_title'],
                         'agency': 'lenta.ru', 'date': r['lenta_date']} for r in records]
        filter_ria = [{'text': r['ria_text'], 'title': r['ria_title'],
                       'agency': 'РИА Новости', 'date': r['lenta_date']} for r in records]
        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [r for r in lenta_records if r['title'] not in lenta_filter_titles]
        ria_records = [r for r in ria_records if r['title'] not in ria_filter_titles]

        random.shuffle(ria_records)
        all_records = [r for r in lenta_records
                       if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + ria_records[:300000]
        random.shuffle(all_records)

        print("Building datasets...")
        full_dataset = GenTitleDataset(all_records, tokenizer,
                                       max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        train_size = int(0.99 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'clusters':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{'title': x['lenta_title'], 'text': x['lenta_text']} for x in records]
        ria_records = [{'title': x['ria_title'], 'text': x['ria_text']} for x in records]

        n1 = int(0.98 * len(lenta_records))
        n2 = int(0.98 * len(ria_records))
        train_records = lenta_records[:n1] + ria_records[:n2]
        val_records = lenta_records[n1:] + ria_records[n2:]

        train_dataset = GenTitleDataset(train_records, tokenizer,
                                        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(val_records, tokenizer,
                                      max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-ria':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        ria_records = [{'title': x['ria_title'], 'text': x['ria_text']} for x in records]
        train_records = ria_records[:int(0.97 * len(ria_records))]
        val_records = ria_records[int(0.97 * len(ria_records)):]

        train_dataset = GenTitleDataset(train_records, tokenizer,
                                        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(val_records, tokenizer,
                                      max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-lenta':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{'title': x['lenta_title'], 'text': x['lenta_text']} for x in records]
        train_records = lenta_records[:int(0.97 * len(lenta_records))]
        val_records = lenta_records[int(0.97 * len(lenta_records)):]

        train_dataset = GenTitleDataset(train_records, tokenizer,
                                        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
        val_dataset = GenTitleDataset(val_records, tokenizer,
                                      max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
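# Hedged sketch of the get_separate_lr_optimizer helper used above: one AdamW
# optimizer with separate encoder/decoder learning rates plus a linear warmup
# schedule, returned as the (optimizer, scheduler) pair Trainer expects.
# The actual utils.training_utils implementation may differ.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

def get_separate_lr_optimizer_sketch(model, enc_lr, dec_lr, warmup_steps, max_steps):
    param_groups = [
        {"params": model.encoder.parameters(), "lr": enc_lr},
        {"params": model.decoder.parameters(), "lr": dec_lr},
    ]
    optimizer = AdamW(param_groups)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, max_steps)
    return optimizer, scheduler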
def train_gen_title(
    config_file: str,
    train_file: str,
    val_file: str,
    train_sample_rate: float,
    val_sample_rate: float,
    output_model_path: str,
    enable_bottleneck: bool = False,
    from_pretrained: str = None,
    checkpoint: str = None
):
    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [r for r in read_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in read_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    print("Building datasets...")
    model_path = config.pop("model_path")
    tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)
    max_tokens_text = config.pop("max_tokens_text", 196)
    max_tokens_title = config.pop("max_tokens_title", 48)

    train_dataset = GenTitleDataset(
        train_records, tokenizer,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    val_dataset = GenTitleDataset(
        val_records, tokenizer,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        model = cls.from_encoder_decoder_pretrained(model_path, model_path)

    print("Training model...")
    batch_size = config.pop("batch_size", 8)
    eval_steps = config.pop("eval_steps", 10000)
    save_steps = config.pop("save_steps", 10000)
    logging_steps = config.pop("logging_steps", 100)
    learning_rate = config.pop("learning_rate", 5e-05)
    warmup_steps = config.pop("warmup_steps", 2000)
    num_train_epochs = config.pop("num_train_epochs", 5)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluate_during_training=True,  # older transformers flag; evaluation_strategy='steps' in newer versions
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=1,
        num_train_epochs=num_train_epochs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
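# Hypothetical invocation of the variant above; all paths are placeholders.
if __name__ == "__main__":
    train_gen_title(
        config_file="configs/gen_title.jsonnet",
        train_file="data/train.jsonl",
        val_file="data/val.jsonl",
        train_sample_rate=1.0,
        val_sample_rate=1.0,
        output_model_path="models/gen_title",
    )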
def train_style_gen_title(
    run_name: str,
    config_file: str,
    train_file: str,
    dataset_type: str,
    output_model_path: str,
    from_pretrained: str = None,
    checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)
        all_records = [r for r in lenta_records
                       if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + ria_records[:220000]
        random.shuffle(all_records)

    print("Building datasets...")
    # Each agency is mapped to a reserved BERT vocabulary token
    # ([unused1], [unused2], ...) used to condition generation on style.
    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")
    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
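# Hedged sketch of the style-conditioning trick itself: the target agency's
# reserved token id is injected at the front of the encoder input, so a single
# model can generate titles in several styles. The actual token layout inside
# AgencyTitleDatasetGeneration may differ.
def prepend_style_token(input_ids: list, agency_token_id: int) -> list:
    return [agency_token_id] + input_ids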
def train_text2title(config_file: str,
                     train_file: str,
                     val_file: str,
                     train_sample_rate: float,
                     val_sample_rate: float,
                     output_title_model_path: str,
                     output_text_model_path: str,
                     random_seed: int,
                     neptune_project: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Loading vectors...")
    ft_model_path = config.pop("ft_vector_model_path", "models/fasttext/ru_vectors_v3.bin")
    ft_model = ft_load_model(ft_model_path)

    print("Fetching data...")
    train_records = [r for r in read_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in read_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    print("Building datasets...")
    max_words = config.get("max_words", 150)
    batch_size = config.get("batch_size", 64)
    num_workers = config.get("num_workers", 5)

    train_data = Text2TitleDataset(train_records, ft_model, max_words=max_words)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers)

    val_data = Text2TitleDataset(val_records, ft_model, max_words=max_words)
    val_loader = DataLoader(val_data, batch_size=batch_size, num_workers=num_workers)

    print("Training model...")
    epochs = config.get("epochs", 100)
    patience = config.get("patience", 4)

    model = Text2TitleModel()
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.0, patience=patience, verbose=True, mode="min")

    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Fasttext text2title",
            tags=["training", "pytorch-lightning", "text2title"],
            params=params)

    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      progress_bar_refresh_rate=100,
                      deterministic=True,
                      logger=logger)
    trainer.fit(model, train_loader, val_loader)
    model.save(output_title_model_path, output_text_model_path)
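# Hedged sketch of how Text2TitleDataset plausibly turns a text into fastText
# word vectors; the real dataset may pad, average, or tokenize differently.
import numpy as np

def text_to_word_vectors_sketch(ft_model, text: str, max_words: int = 150):
    words = text.split()[:max_words]
    if not words:
        return np.zeros((1, ft_model.get_dimension()))
    return np.stack([ft_model.get_word_vector(w) for w in words])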
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    train_fraq: float,
                    output_model_path: str,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text, max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
def train_discriminator(
    run_name: str,
    model_path: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    dataset_type: str,
    output_model_path: str,
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file, agency_list))]
        full_dataset = AgencyTitleDatasetClassification(
            all_records, tokenizer, agency_list,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))])

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))])

        # Exclude the paired test records; `reader` is assumed to be a JSONL
        # reader over the paired Lenta-RIA test set.
        records = [r for r in reader('/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')]
        filter_lenta = [{'text': r['lenta_text'], 'title': r['lenta_title'],
                         'agency': 'lenta.ru', 'date': r['lenta_date']} for r in records]
        filter_ria = [{'text': r['ria_text'], 'title': r['ria_title'],
                       'agency': 'РИА Новости', 'date': r['lenta_date']} for r in records]
        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [r for r in lenta_records if r['title'] not in lenta_filter_titles]
        ria_records = [r for r in ria_records if r['title'] not in ria_filter_titles]

        random.shuffle(ria_records)
        lenta_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']]
        # Balance the two classes by subsampling RIA to the Lenta record count.
        all_records = lenta_records + ria_records[:len(lenta_records)]
        random.shuffle(all_records)

        full_dataset = AgencyTitleDatasetClassification(
            all_records, tokenizer, agency_list,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria-clusters':
        full_dataset = LentaRiaDatasetClassification(train_file, tokenizer, agency_list,
                                                     max_tokens_text, max_tokens_title)

    print("Building datasets...")
    train_size = int(train_fraq * len(full_dataset))
    test_size = int((1 - train_fraq) * 0.5 * len(full_dataset))

    train_dataset, test_dataset, eval_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, test_size, len(full_dataset) - train_size - test_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(eval_dataset),
        'Test dataset size': len(test_dataset),
    })

    print("Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(agency_list))

    print("Training model...")
    batch_size = config["batch_size"]
    logging_steps = config["logging_steps"]
    save_steps = config["save_steps"]
    eval_steps = config["eval_steps"]
    warmup_steps = config["num_warmup_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]
    max_steps = config["max_steps"]
    lr = config["learning_rate"]

    training_args = TrainingArguments(
        output_dir=output_model_path,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        learning_rate=lr,
        warmup_steps=warmup_steps,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        save_total_limit=1,
        weight_decay=0.01,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    wandb.summary.update({'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)})
    model.save_pretrained(output_model_path)
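# Hedged sketch of the compute_metrics callback assumed above; Trainer passes
# an EvalPrediction named tuple of (predictions, label_ids). The actual
# metrics used may differ.
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics_sketch(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }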
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)
    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(
        discr_model_file, num_labels=len(agency_list)).cuda()

    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) if random.random() <= test_sample_rate]

    print("Building datasets...")
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }
    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        # Assemble a batch by hand, keeping only the tensors generate() needs.
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([
            agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))
        ])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True)
            for x in output_ids
        ]

        # Classify each generated title with the agency discriminator.
        for title in preds:
            inp = tokenizer(title, add_special_tokens=True, max_length=max_tokens_title,
                            padding='max_length', truncation=True)
            logits = discriminator(
                input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0),
                attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0)
            )[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
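# Hedged sketch of the first_sent helper assumed above: cut the generated ids
# at the first [SEP] so only the title itself is decoded.
def first_sent_sketch(token_ids, sep_token_id):
    ids = token_ids.tolist()
    return ids[:ids.index(sep_token_id)] if sep_token_id in ids else ids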
def perform_clustering_eval(
    existing_run_name: str,
    existing_run_id: str,
    config_file,
    eval_model_file,
    clustering_data_file,
    gold_markup_file,
    enable_bottleneck,
    text_to_vec_func
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)
    max_tokens_text = config["max_tokens_text"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    gold_markup = get_gold_markup(gold_markup_file)

    url2record, filename2url = get_data_to_cluster(clustering_data_file)
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    text_to_vector_func = get_text_to_vector_func(text_to_vec_func, model, tokenizer)

    print('Calculating embeddings...')
    embeds = np.zeros((len(url2record.items()), 768))
    total_articles = len(url2record.items())
    for i, (url, record) in tqdm.tqdm(enumerate(url2record.items()), total=total_articles):
        text = record["title"] + ' ' + record["text"]
        text = text.lower().replace('\xa0', ' ').strip()
        embeds[i] = text_to_vector_func(text).detach().cpu().numpy().ravel()

    print('Embeds shape =', embeds.shape)

    print('Searching for optimal threshold')
    # Coarse-to-fine search: scan a log-spaced grid first, then refine on a
    # linear grid around the best coarse threshold.
    domain = np.logspace(-3, 0, 11)
    quals = [get_quality(embeds, gold_markup, url2record, dist) for dist in tqdm.tqdm(domain, total=11)]

    closer_domain = np.linspace(
        domain[max(0, np.argmax(quals) - 2)],
        domain[min(np.argmax(quals) + 3, len(domain) - 1)],
        9)
    closer_quals = [get_quality(embeds, gold_markup, url2record, dist) for dist in tqdm.tqdm(closer_domain, total=9)]

    best_dist = closer_domain[np.argmax(closer_quals)]
    print('Best distance:', best_dist)
    get_quality(embeds, gold_markup, url2record, best_dist, print_result=True)
    log_to_wandb(embeds, gold_markup, best_dist, url2record, text_to_vec_func)
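# Hedged sketch of the thresholded clustering get_quality presumably performs
# before scoring against the gold markup; the linkage and metric choices here
# are assumptions.
from sklearn.cluster import AgglomerativeClustering

def cluster_embeddings_sketch(embeds, dist_threshold):
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=dist_threshold,
        metric="cosine",  # named `affinity` in scikit-learn < 1.2
        linkage="average",
    )
    return clustering.fit_predict(embeds)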
def make_inference_and_save(
    config_file,
    eval_model_file,
    test_file,
    test_sample_rate,
    enable_bottleneck,
    cluster_model_file,
    clustering_dist_threshold,
    out_path_prefix,
    dataset_type,
    style_model_eval,
):
    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)
    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    batch_size = config["batch_size"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    if cluster_model_file:
        test_sample_rate = 1.
        filter_dates = ('2020-05-12',)
    else:
        filter_dates = None

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        test_records = [r for r in tqdm.tqdm(ria_reader(test_file)) if random.random() <= test_sample_rate]
    else:
        print("Fetching TG data...")
        test_records = [r for r in tqdm.tqdm(tg_reader(test_file, filter_dates=filter_dates))
                        if random.random() <= test_sample_rate]

    print("Building datasets...")
    if style_model_eval:
        agency_list = config['agency_list']
        agency_to_special_token_id = {
            a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
        }
        test_dataset = AgencyTitleDatasetGeneration(
            test_records, tokenizer,
            filter_agencies=None,
            agency_to_special_token_id=agency_to_special_token_id,
            max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)
    else:
        test_dataset = GenTitleDataset(test_records, tokenizer,
                                       max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title)

    print('Dataset size:', len(test_dataset))

    if cluster_model_file:
        from utils.clustering_utils import get_text_to_vector_func
        clusterer = Clusterer(
            get_text_to_vector_func(
                'bert-FirstCLS',
                BottleneckEncoderDecoderModel.from_pretrained(cluster_model_file),
                tokenizer),
            test_dataset,
            clustering_dist_threshold,
            dates=filter_dates,
        )
        clusterer.perform_clustering()

    with open(out_path_prefix + 'prediction.txt', 'w', encoding='utf-8') as pf, \
            open(out_path_prefix + 'gold.txt', 'w', encoding='utf-8') as gf:
        for i in tqdm.trange(0, len(test_dataset), batch_size):
            data = test_dataset[i]
            for k in tuple(data.keys()):
                if k not in ('input_ids', 'attention_mask'):
                    del data[k]
                else:
                    data[k] = data[k].unsqueeze(0)

            for j in range(i + 1, min(i + batch_size, len(test_dataset))):
                for k in data.keys():
                    data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

            data['input_ids'] = data['input_ids'].cuda()
            data['attention_mask'] = data['attention_mask'].cuda()

            output_ids = model.generate(
                **data,
                decoder_start_token_id=model.config.decoder.pad_token_id,
                min_length=7,
                max_length=20,
                num_beams=6)

            preds = [
                tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True)
                for x in output_ids
            ]

            for j in range(i, min(i + batch_size, len(test_dataset))):
                if cluster_model_file:
                    # With clustering enabled, every title in the record's
                    # cluster serves as a reference, joined by ' s_s '.
                    refs = [r['title'] for r in clusterer.get_cluster_records(j)]
                    gf.write(' s_s '.join(refs) + '\n')
                else:
                    gf.write(test_dataset.get_strings(j)['title'] + '\n')
                pf.write(preds[j - i] + '\n')
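# Hedged sketch: the hand-rolled torch.cat batching above is equivalent to
# stacking the per-example tensors once per batch, e.g.:
import torch

def collate_generation_batch(dataset, start, end):
    items = [dataset[j] for j in range(start, end)]
    return {
        key: torch.stack([item[key] for item in items]).cuda()
        for key in ("input_ids", "attention_mask")
    }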
def distil_embeddings(config_file: str,
                      train_file: str,
                      val_file: str,
                      train_sample_rate: float,
                      val_sample_rate: float,
                      input_model_path: str,
                      output_model_path: str,
                      random_seed: int,
                      neptune_project: str,
                      saved_embeddings: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [r for r in parse_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in parse_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    tokenizer = AutoTokenizer.from_pretrained(input_model_path)
    max_tokens_count = config.get("max_tokens_count", 196)

    if not saved_embeddings or not os.path.isfile(saved_embeddings):
        print("Loading teacher model...")
        input_model = AutoModel.from_pretrained(input_model_path)

        print("Saving embeddings...")
        url2text = {r["url"]: r["text"] for r in itertools.chain(train_records, val_records)}
        urls = []
        embeddings = []
        batch_urls = []
        batch_texts = []
        batch_size = 8
        for url, text in tqdm(url2text.items()):
            batch_urls.append(url)
            batch_texts.append(text)
            if len(batch_urls) == batch_size:
                urls.extend(batch_urls)
                batch_embeddings = calc_batch_embeddings(batch_texts, tokenizer, input_model, max_tokens_count)
                for embedding in batch_embeddings:
                    embeddings.append(embedding)
                batch_urls = []
                batch_texts = []
        if batch_urls:
            # Flush the final, possibly incomplete batch.
            urls.extend(batch_urls)
            batch_embeddings = calc_batch_embeddings(batch_texts, tokenizer, input_model, max_tokens_count)
            for embedding in batch_embeddings:
                embeddings.append(embedding)
        embeddings = torch.tensor(embeddings)
        data = {"urls": urls, "embeddings": embeddings}
        torch.save(data, saved_embeddings)
    else:
        print("Loading embeddings...")
        data = torch.load(saved_embeddings)

    url2num = {url: num for num, url in enumerate(data["urls"])}
    num2embedding = data["embeddings"]

    batch_size = config.get("batch_size", 32)
    num_workers = config.get("num_workers", 5)

    train_dataset = EmbeddingsAsTargetDataset(train_records, url2num, num2embedding, tokenizer, max_tokens_count)
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers)

    val_dataset = EmbeddingsAsTargetDataset(val_records, url2num, num2embedding, tokenizer, max_tokens_count)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)

    patience = config.get("patience", 4)
    epochs = config.get("epochs", 5)
    gradient_clip_val = config.get("gradient_clip_val", 1.0)

    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Distil embeddings",
            tags=["training", "pytorch-lightning", "distil"],
            params=params)

    lightning_model = DistilEmbeddingBertLightning(config)
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.0, patience=patience, verbose=True, mode="min")
    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      gradient_clip_val=gradient_clip_val,
                      deterministic=True,
                      logger=logger)
    trainer.fit(lightning_model, train_loader, val_loader)
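# Hedged sketch of the calc_batch_embeddings helper assumed above:
# mask-weighted mean pooling over the teacher's last hidden states. The real
# helper may pool differently (e.g. take the [CLS] vector).
import torch

def calc_batch_embeddings_sketch(texts, tokenizer, model, max_tokens_count):
    inputs = tokenizer(texts, max_length=max_tokens_count, padding=True,
                       truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state  # (batch, seq_len, dim)
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    return ((hidden * mask).sum(dim=1) / mask.sum(dim=1)).tolist()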