def main():
    logging.set_verbosity_info()
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='best_model_ckpt_0', type=str)
    parser.add_argument('--seed', default=202105, type=int)
    args = parser.parse_args()
    seed_random(args.seed)
    data_path = './user_data/duality_pair_pretrain_no_nsp.txt'
    vocab_path = './user_data/vocab.txt'
    model_path = './user_data/nezha-cn-base'
    output_path = './user_data/pretrained-nezha-base'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    data = read_data(data_path, tokenizer)

    train_dataset = TcDataset(data)

    model = NeZhaForMaskedLM.from_pretrained(model_path)
    model.resize_token_embeddings(tokenizer.vocab_size)

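    # TcCollator is a project-specific collator; presumably it pads/truncates pairs to 30 tokens and applies 15% dynamic MLM masking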
    data_collator = TcCollator(max_seq_len=30,
                               tokenizer=tokenizer,
                               mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, args.model_path)
    tokenizer_and_config = os.path.join(output_path, 'tokenizer_and_config')
    build_path(model_save_path)
    build_path(logging_path)
    build_path(tokenizer_and_config)

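    # 130-epoch MLM pre-training run with mixed precision (fp16 via the native AMP backend)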
    training_args = TrainingArguments(output_dir=output_path,
                                      overwrite_output_dir=True,
                                      learning_rate=6e-5,
                                      num_train_epochs=130,
                                      per_device_train_batch_size=128,
                                      logging_steps=5000,
                                      fp16=True,
                                      fp16_backend='amp',
                                      load_best_model_at_end=True,
                                      prediction_loss_only=True,
                                      logging_dir=logging_path,
                                      logging_first_step=True,
                                      dataloader_num_workers=4,
                                      seed=2021)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(tokenizer_and_config)
Example #2
def main():
    seed_everything(2021)

    logging.set_verbosity_info()

    corpus_path = './user_data/r2_corpus.txt'
    vocab_path = './user_data/r2_vocab_total.txt'
    model_path = './user_data/hfl-roberta-base'
    output_dir = './user_data/self-pretrained-bert-base-r2'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    data = read_data(corpus_path, tokenizer)

    train_dataset = OppoDataset(data)

    model = BertForMaskedLM.from_pretrained(model_path)
    model.resize_token_embeddings(tokenizer.vocab_size)

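    # Collator mirrors the setup above: presumably 32-token sequences with 15% dynamic masking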
    data_collator = Collator(max_seq_len=32,
                             tokenizer=tokenizer,
                             mlm_probability=0.15)

    logging_dir = os.path.join(output_dir, 'log')
    model_save_dir = os.path.join(output_dir, 'model_ckpt-1')
    tokenizer_and_config = os.path.join(output_dir, 'tokenizer_and_config')
    check_dir(model_save_dir)
    check_dir(logging_dir)
    check_dir(tokenizer_and_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        learning_rate=6e-5,
        num_train_epochs=130,
        # num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2000,
        prediction_loss_only=True,
        load_best_model_at_end=True,
        logging_dir=logging_dir,
        logging_first_step=True,
        dataloader_num_workers=4,
        disable_tqdm=False,
        seed=2021)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    trainer.save_model(model_save_dir)
    tokenizer.save_pretrained(tokenizer_and_config)
Example #3
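    # Checks that each verbosity setter keeps the shared transformers logger's effective level in sync with get_verbosity(), then restores the original level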
    def test_set_level(self):
        logger = logging.get_logger()

        # the current default level is logging.WARNING
        level_origin = logging.get_verbosity()

        logging.set_verbosity_error()
        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

        logging.set_verbosity_warning()
        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

        logging.set_verbosity_info()
        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

        logging.set_verbosity_debug()
        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

        # restore to the original level
        logging.set_verbosity(level_origin)
Example #4
def evaluate_gen_title(existing_run_name: str,
                       existing_run_id: str,
                       config_file: str,
                       do_inference: bool,
                       eval_model_file: str,
                       test_file: str,
                       test_sample_rate: float,
                       out_dir: str,
                       dataset_type: str,
                       enable_bottleneck: bool = False,
                       cluster_model_file: str = None,
                       clustering_dist_threshold: float = 0.18,
                       style_model_eval: bool = False,
                       detokenize_after: bool = False,
                       tokenize_after: bool = False):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    out_path_prefix = os.path.join(
        out_dir, eval_model_file[eval_model_file.index('checkpoint'):])
    if out_path_prefix[-1] == '/':
        out_path_prefix = out_path_prefix[:-1]

    out_path_prefix += '-'

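    # do_inference is compared against the string '1', so callers presumably pass it as a string flag; '1' triggers a fresh inference pass before metrics are computed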
    if do_inference == '1':
        make_inference_and_save(config_file, eval_model_file, test_file,
                                test_sample_rate, enable_bottleneck,
                                cluster_model_file, clustering_dist_threshold,
                                out_path_prefix, dataset_type,
                                style_model_eval)

    evaluate_and_print_metrics(out_path_prefix + 'prediction.txt',
                               out_path_prefix + 'gold.txt',
                               detokenize_after=detokenize_after,
                               tokenize_after=tokenize_after,
                               is_multiple_ref=(cluster_model_file
                                                is not None),
                               lower=True,
                               are_clusters_used=(cluster_model_file
                                                  is not None))
Example #5
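    # Thin embedding wrapper: loads an AutoTokenizer/AutoModel pair and moves the model to the GPU only when requested and available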
    def __init__(
        self,
        pretrained_model_name_or_path: str,
        log_info: bool = False,
        use_gpu: bool = False,
        do_lower_case: bool = False,
        do_basic_tokenize: bool = True,
        strip_accents: bool = True
    ):
        if log_info:
            logging.set_verbosity_info()
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            strip_accents=strip_accents
        )
        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path)
        self.use_gpu = use_gpu and torch.cuda.is_available()
        if self.use_gpu:
            self.model.cuda()
        self.model.eval()
Example #6
from transformers import (
    ForCondGen,
    XLMForCondGen,
    logging,
)

from transformers_old.modeling_prophetnet import (
    ForCondGen as ForCondGenOld,
)
from transformers_old.modeling_xlm_prophetnet import (
    XLMForCondGen as XLMForCondGenOld,
)


logger = logging.get_logger(__name__)
logging.set_verbosity_info()


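# Loads the same checkpoint with the old and new model classes; the mapping below renames attention/layer-norm modules between the two implementations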
def to_pytorch(src_path, save_path):
    if "xprophetnet" in src_path:
        prophet_old = XLMForCondGenOld.from_pretrained(src_path)
        prophet, loading_info = XLMForCondGen.from_pretrained(src_path, output_loading_info=True)
    else:
        prophet_old = ForCondGenOld.from_pretrained(src_path)
        prophet, loading_info = ForCondGen.from_pretrained(src_path, output_loading_info=True)
    special_keys = ["key_proj", "value_proj", "query_proj"]
    mapping = {
        "self_attn": "ngram_self_attn",
        "cross_attn": "encoder_attn",
        "cross_attn_layer_norm": "encoder_attn_layer_norm",
        "feed_forward_layer_norm": "final_layer_norm",
Example #7
    lmap,
    pickle_save,
    save_git_info,
    save_json,
    set_extra_model_params,
    Seq2SeqDataset,
)

# need the parent dir module
sys.path.insert(2, str(Path(__file__).resolve().parents[1]))


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

transformers_logging.set_verbosity_info()


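# AttrDict lets hyper-parameters be accessed either as dictionary keys or as attributes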
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


class GenerativeQAModule(BaseTransformer):
    mode = "generative_qa"
    loss_names = ["loss"]
    metric_names = ["em"]
    val_metric = "em"

    def __init__(self, hparams, **kwargs):
Example #8
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    val_file: str,
                    dataset_type: str,
                    train_sample_rate: float,
                    val_sample_rate: float,
                    output_model_path: str,
                    enable_bottleneck: bool = False,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")

    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel

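    # Either resume from a full encoder-decoder checkpoint or stitch one together from separate encoder and decoder weights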
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = cls.from_encoder_decoder_pretrained(enc_model_path,
                                                    dec_model_path)

    model.cuda()

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        train_records = [
            r for r in tqdm.tqdm(ria_reader(train_file))
            if random.random() <= train_sample_rate
        ]
        val_records = [
            r for r in tqdm.tqdm(ria_reader(val_file))
            if random.random() <= val_sample_rate
        ]

        print("Building datasets...")

        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)
    elif dataset_type == 'tg':
        print("Fetching TG data...")
        all_records = [
            r for r in tqdm.tqdm(tg_reader(train_file))
            if random.random() <= train_sample_rate
        ]

        print("Building datasets...")

        full_dataset = GenTitleDataset(all_records,
                                       tokenizer,
                                       max_tokens_text=max_tokens_text,
                                       max_tokens_title=max_tokens_title)

        train_size = int(0.995 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset,
            [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'lenta-ria':
        print('Fetching Lenta-RIA data...')
        lenta_records = [
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))
        ]
        lenta_records.extend([
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))
        ])

        ria_records = [
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.train.json')))
        ]
        ria_records.extend([
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.val.json')))
        ])

        records = [
            r for r in reader(
                '/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')
        ]

        filter_lenta = [{
            'text': r['lenta_text'],
            'title': r['lenta_title'],
            'agency': 'lenta.ru',
            'date': r['lenta_date']
        } for r in records]

        filter_ria = [{
            'text': r['ria_text'],
            'title': r['ria_title'],
            'agency': 'РИА Новости',
            'date': r['lenta_date']
        } for r in records]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [
            r for r in lenta_records if r['title'] not in lenta_filter_titles
        ]
        ria_records = [
            r for r in ria_records if r['title'] not in ria_filter_titles
        ]

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:300000]

        random.shuffle(all_records)

        print("Building datasets...")

        full_dataset = GenTitleDataset(all_records,
                                       tokenizer,
                                       max_tokens_text=max_tokens_text,
                                       max_tokens_title=max_tokens_title)

        train_size = int(0.99 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset,
            [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'clusters':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{
            'title': x['lenta_title'],
            'text': x['lenta_text']
        } for x in records]
        ria_records = [{
            'title': x['ria_title'],
            'text': x['ria_text']
        } for x in records]
        n1 = int(0.98 * len(lenta_records))
        n2 = int(0.98 * len(ria_records))
        train_records = lenta_records[:n1] + ria_records[:n2]
        val_records = lenta_records[n1:] + ria_records[n2:]

        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-ria':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]
        ria_records = [{
            'title': x['ria_title'],
            'text': x['ria_text']
        } for x in records]
        train_records = ria_records[:int(0.97 * len(ria_records))]
        val_records = ria_records[int(0.97 * len(ria_records)):]
        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)

    elif dataset_type == 'baseline-lenta':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]
        lenta_records = [{
            'title': x['lenta_title'],
            'text': x['lenta_text']
        } for x in records]
        train_records = lenta_records[:int(0.97 * len(lenta_records))]
        val_records = lenta_records[int(0.97 * len(lenta_records)):]
        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset)
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

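    # get_separate_lr_optimizer presumably returns the (optimizer, scheduler) pair that Trainer's optimizers argument expects, with distinct encoder/decoder learning rates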
    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps,
                                    max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #9
def train_gen_title(
    config_file: str,
    train_file: str,
    val_file: str,
    train_sample_rate: float,
    val_sample_rate: float,
    output_model_path: str,
    enable_bottleneck: bool = False,
    from_pretrained: str = None,
    checkpoint: str = None
):
    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")
    logging.set_verbosity_info()

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [r for r in read_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in read_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    print("Building datasets...")
    model_path = config.pop("model_path")
    tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)

    max_tokens_text = config.pop("max_tokens_text", 196)
    max_tokens_title = config.pop("max_tokens_title", 48)

    train_dataset = GenTitleDataset(
        train_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)

    val_dataset = GenTitleDataset(
        val_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        model = cls.from_encoder_decoder_pretrained(model_path, model_path)

    print("Training model...")
    batch_size = config.pop("batch_size", 8)
    eval_steps = config.pop("eval_steps", 10000)
    save_steps = config.pop("save_steps", 10000)
    logging_steps = config.pop("logging_steps", 100)
    learning_rate = config.pop("learning_rate", 5e-05)
    warmup_steps = config.pop("warmup_steps", 2000)
    num_train_epochs = config.pop("num_train_epochs", 5)
    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=1,
        num_train_epochs=num_train_epochs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #10
def train_style_gen_title(
        run_name: str,
        config_file: str,
        train_file: str,
        dataset_type: str,
        output_model_path: str,
        from_pretrained: str = None,
        checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")

    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:220000]

        random.shuffle(all_records)

    print("Building datasets...")

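    # Map each agency to a reserved [unusedN] vocab id; the dataset presumably uses it as a style control token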
    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records, tokenizer, 
        filter_agencies=list(agency_to_special_token_id.keys()), agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset,
                                                               [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")
    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #11
parser.add_argument(
    "--eval_steps",
    type=int,
    default=500,
    help="If input should be tokenized to only lowercase",
)
parser.add_argument(
    "--do_lowercase",
    action="store_true",
    help="If input should be lowercase or not when tokenizing",
)

args = parser.parse_args()

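# Route transformers' own log messages through the default handler with explicit formatting at INFO level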
hf_logging.enable_default_handler()
hf_logging.set_verbosity_info()
hf_logging.enable_explicit_format()

# Setup logging
tb_writer = SummaryWriter(log_dir=args.logging_dir)

logger = logging.getLogger("")
logger.setLevel(logging.INFO)

fh = logging.FileHandler(f"{args.logging_dir}.log")
sh = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(
    "[%(asctime)s], %(levelname)s %(message)s",
    datefmt="%a, %d %b %Y %H:%M:%S",
)
fh.setFormatter(formatter)
Example #12
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    train_fraq: float,
                    output_model_path: str,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text,
                                   max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))

    train_dataset, val_dataset = \
            torch.utils.data.random_split(full_dataset, [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps,
                                    max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #13
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    hf_logging.set_verbosity_info()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        logger.info(
            f"Output dir ({training_args.output_dir}) is not empty, will try to reload from there."
        )
        model_args.model_name_or_path = training_args.output_dir
        # raise ValueError(
        #     f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        # )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logger.info(model)
    num_params = sum(p.numel() for p in model.parameters())
    logger.info('Model has %d parameters' % num_params)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info('Model has %d trainable parameters' % num_params)

    # ADD special tokens
    tokenizer.pad_token = tokenizer.eos_token
    special_tokens_dict = {
        'additional_special_tokens':
        ['<STORY>', '<QUERY>', '<PROOF>', '<ANSWER>']
    }
    # NOTE: should also have added "ent_1", "ent_2", ..., "ent_20" :/
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    logger.info(f'We have added {num_added_toks} tokens')
    '''
    if tokenizer.pad_token_id is None and data_args.line_by_line:
        # See PR 3388. Some tokenizers don't have pad tokens, which causes errors at the encoding step in the collate_fn.
        # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
        # when feeding to the model.
        # tokenizer.pad_token = tokenizer.eos_token
        num_added_toks = tokenizer.add_special_tokens({"pad_token": "<pad>"})
    '''

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.model_max_length
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size,
                                   tokenizer.model_max_length)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir) if
                    (training_args.do_eval
                     or training_args.evaluate_during_training) else None)
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # start by saving tokenizer so that we can restart training!
    # if trainer.is_world_master():
    #     tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        logger.info(f"model_path: {model_path}")
        if model_path is not None:
            # Grab the most recent checkpoint
            checkpoints_sorted = trainer._sorted_checkpoints(use_mtime=True)
            assert len(checkpoints_sorted) > 0
            checkpoint_most_recent = checkpoints_sorted[-1]
            logger.info(
                f"most recent checkpoint: {checkpoint_most_recent}. setting model_path to this."
            )
            # TODO: find a way to set:
            # - patience_best_eval_loss = None
            # - patience_evals_without_improvement = 0
            # - patience_should_stop = False
            model_path = checkpoint_most_recent
        train_results = trainer.train(model_path=model_path)
        results["train_step"] = train_results.global_step
        results["train_loss"] = train_results.training_loss
        results["train_ppl"] = math.exp(train_results.training_loss)

        # trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        # if trainer.is_world_master():
        #     tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        results["valid_loss"] = eval_output["eval_loss"]
        results["valid_ppl"] = math.exp(eval_output["eval_loss"])

    output_eval_file = os.path.join(training_args.output_dir, "results_lm.txt")
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** results *****")
            for key in sorted(results.keys()):
                logger.info("  %s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))

    return results
Example #14
def train_discriminator(
    run_name: str,
    model_path: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    dataset_type: str,
    output_model_path: str,
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [
            r for r in tqdm.tqdm(tg_reader(train_file, agency_list))
        ]
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria':
        lenta_records = [
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))
        ]
        lenta_records.extend([
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))
        ])

        ria_records = [
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.train.json')))
        ]
        ria_records.extend([
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.val.json')))
        ])

        records = [
            r for r in reader(
                '/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')
        ]

        filter_lenta = [{
            'text': r['lenta_text'],
            'title': r['lenta_title'],
            'agency': 'lenta.ru',
            'date': r['lenta_date']
        } for r in records]

        filter_ria = [{
            'text': r['ria_text'],
            'title': r['ria_title'],
            'agency': 'РИА Новости',
            'date': r['lenta_date']
        } for r in records]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [
            r for r in lenta_records if r['title'] not in lenta_filter_titles
        ]
        ria_records = [
            r for r in ria_records if r['title'] not in ria_filter_titles
        ]

        random.shuffle(ria_records)
        lenta_records = [
            r for r in lenta_records
            if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']
        ]

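        # Balance the two sources: keep only as many RIA records as date-filtered Lenta records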
        all_records = lenta_records + ria_records[:len(lenta_records)]

        random.shuffle(all_records)
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria-clusters':
        full_dataset = LentaRiaDatasetClassification(train_file, tokenizer,
                                                     agency_list,
                                                     max_tokens_text,
                                                     max_tokens_title)

    print("Building datasets...")

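    # Three-way split: train_fraq for training, with the remainder divided evenly between validation and test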
    train_size = int(train_fraq * len(full_dataset))
    test_size = int((1 - train_fraq) * 0.5 * len(full_dataset))

    train_dataset, test_dataset, eval_dataset = \
        torch.utils.data.random_split(full_dataset, [train_size, test_size, len(full_dataset) - train_size - test_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(eval_dataset),
        'Test dataset size': len(test_dataset),
    })

    print("Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=len(agency_list))

    print("Training model...")
    batch_size = config["batch_size"]
    logging_steps = config["logging_steps"]
    save_steps = config["save_steps"]
    eval_steps = config["eval_steps"]
    warmup_steps = config["num_warmup_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]
    max_steps = config["max_steps"]
    lr = config["learning_rate"]

    training_args = TrainingArguments(
        output_dir=output_model_path,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        learning_rate=lr,
        warmup_steps=warmup_steps,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        save_total_limit=1,
        weight_decay=0.01,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    wandb.summary.update(
        {'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)})
    model.save_pretrained(output_model_path)
Example #15
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(discr_model_file, num_labels=len(agency_list)).cuda()
    
    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) 
        if random.random() <= test_sample_rate]
    
    print("Building datasets...")
    
    
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }

    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

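    # Assemble batches manually: keep only input_ids / attention_mask and stack batch_size examples at a time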
    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([ agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True) for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title, 
                add_special_tokens=True, max_length=max_tokens_title,
                padding='max_length', truncation=True
            )

            logits = discriminator(input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0), 
                                   attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
Example #16
def perform_clustering_eval(
    existing_run_name: str,
    existing_run_id: str,
    config_file,
    eval_model_file,
    clustering_data_file,
    gold_markup_file,
    enable_bottleneck,
    text_to_vec_func
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    gold_markup = get_gold_markup(gold_markup_file)

    url2record, filename2url = get_data_to_cluster(clustering_data_file)
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    text_to_vector_func = get_text_to_vector_func(text_to_vec_func, model, tokenizer)

    print('Calculating embeddings...')
    embeds = np.zeros((len(url2record.items()), 768))

    total_articles = len(url2record.items())

    for i, (url, record) in tqdm.tqdm(enumerate(url2record.items()), total=total_articles):
        text = record["title"] + ' ' + record["text"]
        text = text.lower().replace('\xa0', ' ').strip()
        embeds[i] = text_to_vector_func(text).detach().cpu().numpy().ravel()

    print('Embeds shape =', embeds.shape)

    print('Searching for optimal threshold')
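    # Coarse-to-fine search: scan clustering thresholds on a log scale, then refine linearly around the best coarse value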
    domain = np.logspace(-3, 0, 11)
    quals = [get_quality(embeds, gold_markup, url2record, dist)
            for dist in tqdm.tqdm(domain, total=11)]

    closer_domain = np.linspace(
        domain[max(0, np.argmax(quals) - 2)],
        domain[min(np.argmax(quals) + 3, len(domain) - 1)], 
        9)
    closer_quals = [get_quality(embeds, gold_markup, url2record, dist)
                    for dist in tqdm.tqdm(closer_domain, total=9)]

    best_dist = closer_domain[np.argmax(closer_quals)]
    print('Best distance:', best_dist)

    get_quality(embeds, gold_markup, url2record, best_dist, print_result=True)
    log_to_wandb(embeds, gold_markup, best_dist, url2record, text_to_vec_func)