Example no. 1
 def __init__(self, db_session, config_path: str):
     self.db_session = db_session
     self.config = json.loads(jsonnet_evaluate_file(config_path))
     self.processors = dict()
     for key, item in self.config["processors"].items():
         item_type = item.pop("type")
         self.processors[key] = Processor.by_name(item_type)(**item)
         print("'{}' processor loaded".format(key))
Example no. 2
 def __init__(self, config_path):
     self.config = json.loads(jsonnet_evaluate_file(config_path))
     self.lang_detect_model_path = self.config["lang_detect_model_path"]
     self.cat_detect_model_path = self.config["cat_detect_model_path"]
     self.max_tokens = self.config.get("max_tokens")
     self.is_lower = self.config["is_lower"]
     self.languages = self.config.get("languages", ["ru", "en"])
     self.is_news_only = self.config.get("is_news_only", False)
     assert os.path.exists(
         self.lang_detect_model_path), "No language detection model found"
     assert os.path.exists(
         self.cat_detect_model_path), "No category detection model found"
     self.lang_detect_model = ft_load_model(self.lang_detect_model_path)
     self.cat_detect_model = ft_load_model(self.cat_detect_model_path)
     self.tokenizer = Tokenizer("conservative", joiner_annotate=False)
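
For reference, a config carrying the keys this constructor reads might look as follows; the paths and values are placeholders, not the project's actual settings:

example_config = {
    "lang_detect_model_path": "models/lang_detect.ftz",  # fastText language-id model
    "cat_detect_model_path": "models/cat_detect.ftz",    # fastText category model
    "max_tokens": 100,
    "is_lower": True,
    "languages": ["ru", "en"],   # optional, defaults to ["ru", "en"]
    "is_news_only": False,       # optional, defaults to False
}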
Example no. 3
    def __init__(self, db_session, config_path: str):
        self.config = json.loads(jsonnet_evaluate_file(config_path))
        self.db_session = db_session
        self.vectors = None

        self.num2doc = list()
        self.num2entities = list()
        self.num2keywords = list()
        self.num2host = list()
        self.num2timestamp = list()
        self.doc_count = 0

        self.id2num = dict()
        self.keyword2nums = defaultdict(list)

        self.distances = None

        self.labels = dict()
        self.clusters = defaultdict(list)
Example no. 4
def train_tfidf(config_file, input_file, output_file, svd_matrix_file):
    config = json.loads(jsonnet_evaluate_file(config_file))
    input_file = get_true_file(input_file)
    assert input_file.endswith(".jsonl")

    print("Parsing input data...")
    corpus = []
    for record in tqdm(read_tg_jsonl(input_file)):
        corpus.append(record.pop("title") + " " + record.pop("text"))

    idfs = build_idf_vocabulary(corpus, **config.pop("building"))

    print("Saving vocabulary with IDFs...")
    with open(output_file, "w") as w:
        for word, idf in idfs:
            w.write("{}\t{}\n".format(word, idf))

    word2idf = {word: idf for word, idf in idfs}
    word2idx = {word: idx for idx, (word, _) in enumerate(idfs)}

    print("Preparing CSR martix...")
    X_data = []
    X_col_ind = []
    X_row_ind = []
    for i, text in enumerate(corpus):
        data, col_ind = get_tfidf_vector(text, word2idf, word2idx)
        row_ind = [i for _ in range(len(col_ind))]
        X_data += data
        X_col_ind += col_ind
        X_row_ind += row_ind
    X = csr_matrix((X_data, (X_row_ind, X_col_ind)))

    print("Calculating truncated SVD...")
    svd_dim = config.pop("svd_dim")
    svd = TruncatedSVD(n_components=svd_dim, n_iter=100, random_state=42)
    svd.fit(X)
    matrix = svd.components_.T
    model = SVDEmbedder(len(word2idf), svd_dim)
    model.mapping_layer.weight.data = torch.DoubleTensor(matrix).transpose(
        0, 1)
    torch.save(model, svd_matrix_file)
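
SVDEmbedder itself is not shown in this example; given that mapping_layer.weight is assigned a double-precision matrix of shape (svd_dim, vocab_size), a compatible minimal sketch (an assumption, not the project's class) is a single bias-free linear projection:

import torch
import torch.nn as nn

class SVDEmbedder(nn.Module):
    # Projects a TF-IDF vector of size vocab_size down to svd_dim dimensions.
    def __init__(self, vocab_size: int, svd_dim: int):
        super().__init__()
        self.mapping_layer = nn.Linear(vocab_size, svd_dim, bias=False).double()

    def forward(self, tfidf_vectors):
        return self.mapping_layer(tfidf_vectors)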
Example no. 5
import json

import wandb

# In[ ]:

from _jsonnet import evaluate_file as jsonnet_evaluate_file
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments, logging

from readers import tg_reader, lenta_reader, ria_reader
from custom_datasets import FullStyleDataset
from utils.training_utils import get_separate_lr_optimizer, init_wandb

# In[ ]:

logging.set_verbosity_info()
config = json.loads(
    jsonnet_evaluate_file(
        '/home/aobuhtijarov/master-thesis/configs/gen_title.jsonnet'))
init_wandb('full-style', config)

agency_list = ["РИА Новости", "lenta.ru"]
print('Agency list:', agency_list)

# In[ ]:

tokenizer_model_path = config["tokenizer_model_path"]
tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                          do_lower_case=False,
                                          do_basic_tokenize=False)

max_tokens_text = config["max_tokens_text"]
max_tokens_title = config["max_tokens_title"]
Example no. 6
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    val_file: str,
                    dataset_type: str,
                    train_sample_rate: float,
                    val_sample_rate: float,
                    output_model_path: str,
                    enable_bottleneck: bool = False,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")

    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel

    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = cls.from_encoder_decoder_pretrained(enc_model_path,
                                                    dec_model_path)

    model.cuda()

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        train_records = [
            r for r in tqdm.tqdm(ria_reader(train_file))
            if random.random() <= train_sample_rate
        ]
        val_records = [
            r for r in tqdm.tqdm(ria_reader(val_file))
            if random.random() <= val_sample_rate
        ]

        print("Building datasets...")

        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)
    elif dataset_type == 'tg':
        print("Fetching TG data...")
        all_records = [
            r for r in tqdm.tqdm(tg_reader(train_file))
            if random.random() <= train_sample_rate
        ]

        print("Building datasets...")

        full_dataset = GenTitleDataset(all_records,
                                       tokenizer,
                                       max_tokens_text=max_tokens_text,
                                       max_tokens_title=max_tokens_title)

        train_size = int(0.995 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset,
            [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'lenta-ria':
        print('Fetching Lenta-RIA data...')
        lenta_records = [
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))
        ]
        lenta_records.extend([
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))
        ])

        ria_records = [
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.train.json')))
        ]
        ria_records.extend([
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.val.json')))
        ])

        records = [
            r for r in reader(
                '/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')
        ]

        filter_lenta = [{
            'text': r['lenta_text'],
            'title': r['lenta_title'],
            'agency': 'lenta.ru',
            'date': r['lenta_date']
        } for r in records]

        filter_ria = [{
            'text': r['ria_text'],
            'title': r['ria_title'],
            'agency': 'РИА Новости',
            'date': r['lenta_date']
        } for r in records]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [
            r for r in lenta_records if r['title'] not in lenta_filter_titles
        ]
        ria_records = [
            r for r in ria_records if r['title'] not in ria_filter_titles
        ]

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:300000]

        random.shuffle(all_records)

        print("Building datasets...")

        full_dataset = GenTitleDataset(all_records,
                                       tokenizer,
                                       max_tokens_text=max_tokens_text,
                                       max_tokens_title=max_tokens_title)

        train_size = int(0.99 * len(full_dataset))
        train_dataset, val_dataset = torch.utils.data.random_split(
            full_dataset,
            [train_size, len(full_dataset) - train_size])
    elif dataset_type == 'clusters':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]

        lenta_records = [{
            'title': x['lenta_title'],
            'text': x['lenta_text']
        } for x in records]
        ria_records = [{
            'title': x['ria_title'],
            'text': x['ria_text']
        } for x in records]
        n1 = int(0.98 * len(lenta_records))
        n2 = int(0.98 * len(ria_records))
        train_records = lenta_records[:n1] + ria_records[:n2]
        val_records = lenta_records[n1:] + ria_records[n2:]

        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)
    elif dataset_type == 'baseline-ria':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]
        ria_records = [{
            'title': x['ria_title'],
            'text': x['ria_text']
        } for x in records]
        train_records = ria_records[:int(0.97 * len(ria_records))]
        val_records = ria_records[int(0.97 * len(ria_records)):]
        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)

    elif dataset_type == 'baseline-lenta':
        with open(train_file, 'r') as f:
            records = [json.loads(x.strip()) for x in f.readlines()]
        lenta_records = [{
            'title': x['lenta_title'],
            'text': x['lenta_text']
        } for x in records]
        train_records = lenta_records[:int(0.97 * len(lenta_records))]
        val_records = lenta_records[int(0.97 * len(lenta_records)):]
        train_dataset = GenTitleDataset(train_records,
                                        tokenizer,
                                        max_tokens_text=max_tokens_text,
                                        max_tokens_title=max_tokens_title)

        val_dataset = GenTitleDataset(val_records,
                                      tokenizer,
                                      max_tokens_text=max_tokens_text,
                                      max_tokens_title=max_tokens_title)

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset)
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps,
                                    max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
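
get_separate_lr_optimizer comes from the project's training utilities (imported in example no. 5) and is passed to Trainer via optimizers=opt, which expects an (optimizer, scheduler) pair. A hedged sketch of what such a helper is assumed to do, with separate learning rates for encoder and decoder:

import torch
from transformers import get_linear_schedule_with_warmup

def get_separate_lr_optimizer_sketch(model, enc_lr, dec_lr, warmup_steps, max_steps):
    # One AdamW with two parameter groups so the encoder and decoder can be
    # trained at different learning rates, plus linear warmup/decay.
    optimizer = torch.optim.AdamW([
        {"params": model.encoder.parameters(), "lr": enc_lr},
        {"params": model.decoder.parameters(), "lr": dec_lr},
    ])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps)
    return optimizer, scheduler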
Example no. 7
def train_gen_title(
    config_file: str,
    train_file: str,
    val_file: str,
    train_sample_rate: float,
    val_sample_rate: float,
    output_model_path: str,
    enable_bottleneck: bool = False,
    from_pretrained: str = None,
    checkpoint: str = None
):
    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")
    logging.set_verbosity_info()

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [r for r in read_tg_jsonl(train_file) if random.random() <= train_sample_rate]
    val_records = [r for r in read_tg_jsonl(val_file) if random.random() <= val_sample_rate]

    print("Building datasets...")
    model_path = config.pop("model_path")
    tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)

    max_tokens_text = config.pop("max_tokens_text", 196)
    max_tokens_title = config.pop("max_tokens_title", 48)

    train_dataset = GenTitleDataset(
        train_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)

    val_dataset = GenTitleDataset(
        val_records,
        tokenizer,
        max_tokens_text=max_tokens_text,
        max_tokens_title=max_tokens_title)

    print("Initializing model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    if from_pretrained:
        model = cls.from_pretrained(from_pretrained)
    else:
        model = cls.from_encoder_decoder_pretrained(model_path, model_path)

    print("Training model...")
    batch_size = config.pop("batch_size", 8)
    eval_steps = config.pop("eval_steps", 10000)
    save_steps = config.pop("save_steps", 10000)
    logging_steps = config.pop("logging_steps", 100)
    learning_rate = config.pop("learning_rate", 5e-05)
    warmup_steps = config.pop("warmup_steps", 2000)
    num_train_epochs = config.pop("num_train_epochs", 5)
    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=1,
        num_train_epochs=num_train_epochs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example no. 8
def train_style_gen_title(
        run_name: str,
        config_file: str,
        train_file: str,
        dataset_type: str,
        output_model_path: str,
        from_pretrained: str = None,
        checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")

    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:220000]

        random.shuffle(all_records)

    print("Building datasets...")

    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records, tokenizer, 
        filter_agencies=list(agency_to_special_token_id.keys()), agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset,
                                                               [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")
    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example no. 9
def train_text2title(config_file: str, train_file: str, val_file: str,
                     train_sample_rate: float, val_sample_rate: float,
                     output_title_model_path: str, output_text_model_path: str,
                     random_seed: int, neptune_project: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Loading vectors...")
    ft_model_path = config.pop("ft_vector_model_path",
                               "models/fasttext/ru_vectors_v3.bin")
    ft_model = ft_load_model(ft_model_path)

    print("Fetching data...")
    train_records = [
        r for r in read_tg_jsonl(train_file)
        if random.random() <= train_sample_rate
    ]
    val_records = [
        r for r in read_tg_jsonl(val_file)
        if random.random() <= val_sample_rate
    ]

    print("Building datasets...")
    max_words = config.get("max_words", 150)
    batch_size = config.get("batch_size", 64)
    num_workers = config.get("num_workers", 5)
    train_data = Text2TitleDataset(train_records,
                                   ft_model,
                                   max_words=max_words)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              num_workers=num_workers)

    val_data = Text2TitleDataset(val_records, ft_model, max_words=max_words)
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            num_workers=num_workers)

    print("Training model...")
    epochs = config.get("epochs", 100)
    patience = config.get("patience", 4)
    model = Text2TitleModel()
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=True,
                                        mode="min")
    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Fasttext text2title",
            tags=["training", "pytorch-lightning", "text2title"],
            params=params)
    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      progress_bar_refresh_rate=100,
                      deterministic=True,
                      logger=logger)
    trainer.fit(model, train_loader, val_loader)
    model.save(output_title_model_path, output_text_model_path)
Example no. 10
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    train_fraq: float,
                    output_model_path: str,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text,
                                   max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))

    train_dataset, val_dataset = \
            torch.utils.data.random_split(full_dataset, [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps,
                                    max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example no. 11
def train_discriminator(
    run_name: str,
    model_path: str,
    config_file: str,
    train_file: str,
    train_fraq: float,
    dataset_type: str,
    output_model_path: str,
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [
            r for r in tqdm.tqdm(tg_reader(train_file, agency_list))
        ]
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria':
        lenta_records = [
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))
        ]
        lenta_records.extend([
            r for r in tqdm.tqdm(
                lenta_reader(
                    os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))
        ])

        ria_records = [
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.train.json')))
        ]
        ria_records.extend([
            r for r in tqdm.tqdm(
                ria_reader(
                    os.path.join(train_file, 'ria/ria.shuffled.val.json')))
        ])

        records = [
            r for r in reader(
                '/home/aobuhtijarov/datasets/full_lenta_ria.test.jsonl')
        ]

        filter_lenta = [{
            'text': r['lenta_text'],
            'title': r['lenta_title'],
            'agency': 'lenta.ru',
            'date': r['lenta_date']
        } for r in records]

        filter_ria = [{
            'text': r['ria_text'],
            'title': r['ria_title'],
            'agency': 'РИА Новости',
            'date': r['lenta_date']
        } for r in records]

        lenta_filter_titles = set(x['title'] for x in filter_lenta)
        ria_filter_titles = set(x['title'] for x in filter_ria)
        lenta_records = [
            r for r in lenta_records if r['title'] not in lenta_filter_titles
        ]
        ria_records = [
            r for r in ria_records if r['title'] not in ria_filter_titles
        ]

        random.shuffle(ria_records)
        lenta_records = [
            r for r in lenta_records
            if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']
        ]

        all_records = lenta_records + ria_records[:len(lenta_records)]

        random.shuffle(all_records)
        full_dataset = AgencyTitleDatasetClassification(
            all_records,
            tokenizer,
            agency_list,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    elif dataset_type == 'lenta-ria-clusters':
        full_dataset = LentaRiaDatasetClassification(train_file, tokenizer,
                                                     agency_list,
                                                     max_tokens_text,
                                                     max_tokens_title)

    print("Building datasets...")

    train_size = int(train_fraq * len(full_dataset))
    test_size = int((1 - train_fraq) * 0.5 * len(full_dataset))

    train_dataset, test_dataset, eval_dataset = \
        torch.utils.data.random_split(full_dataset, [train_size, test_size, len(full_dataset) - train_size - test_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(eval_dataset),
        'Test dataset size': len(test_dataset),
    })

    print("Initializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=len(agency_list))

    print("Training model...")
    batch_size = config["batch_size"]
    logging_steps = config["logging_steps"]
    save_steps = config["save_steps"]
    eval_steps = config["eval_steps"]
    warmup_steps = config["num_warmup_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]
    max_steps = config["max_steps"]
    lr = config["learning_rate"]

    training_args = TrainingArguments(
        output_dir=output_model_path,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        learning_rate=lr,
        warmup_steps=warmup_steps,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        eval_steps=eval_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        save_total_limit=1,
        weight_decay=0.01,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    wandb.summary.update(
        {'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)})
    model.save_pretrained(output_model_path)
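
compute_metrics passed to the Trainer above is not shown; a typical callback for this agency classifier could look like the sketch below (the project's actual metrics may differ):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics_sketch(eval_pred):
    # eval_pred is the (predictions, label_ids) pair produced during evaluation.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }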
Example no. 12
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(discr_model_file, num_labels=len(agency_list)).cuda()
    
    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) 
        if random.random() <= test_sample_rate]
    
    print("Building datasets...")
    
    
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }

    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([
            agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))
        ])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True) for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title, 
                add_special_tokens=True, max_length=max_tokens_title,
                padding='max_length', truncation=True
            )

            logits = discriminator(input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0), 
                                   attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
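
first_sent is used above to cut the generated ids before decoding; a hedged guess at its behaviour (truncate at the first separator token, otherwise keep everything) is:

def first_sent(token_ids, sep_token_id):
    # Keep the generated ids up to the first [SEP] so only the first sentence
    # of the generated title is decoded; fall back to the full sequence.
    ids = token_ids.tolist()
    return ids[:ids.index(sep_token_id)] if sep_token_id in ids else ids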
Example no. 13
def perform_clustering_eval(
    existing_run_name: str,
    existing_run_id: str,
    config_file,
    eval_model_file,
    clustering_data_file,
    gold_markup_file,
    enable_bottleneck,
    text_to_vec_func
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    gold_markup = get_gold_markup(gold_markup_file)

    url2record, filename2url = get_data_to_cluster(clustering_data_file)
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)
    text_to_vector_func = get_text_to_vector_func(text_to_vec_func, model, tokenizer)

    print('Calculating embeddings...')
    embeds = np.zeros((len(url2record.items()), 768))

    total_articles = len(url2record.items())

    for i, (url, record) in tqdm.tqdm(enumerate(url2record.items()), total=total_articles):
        text = record["title"] + ' ' + record["text"]
        text = text.lower().replace('\xa0', ' ').strip()
        embeds[i] = text_to_vector_func(text).detach().cpu().numpy().ravel()

    print('Embeds shape =', embeds.shape)

    print('Searching for optimal threshold')
    domain = np.logspace(-3, 0, 11)
    quals = [get_quality(embeds, gold_markup, url2record, dist)
            for dist in tqdm.tqdm(domain, total=11)]

    closer_domain = np.linspace(
        domain[max(0, np.argmax(quals) - 2)],
        domain[min(np.argmax(quals) + 3, len(domain) - 1)], 
        9)
    closer_quals = [get_quality(embeds, gold_markup, url2record, dist)
                    for dist in tqdm.tqdm(closer_domain, total=9)]

    best_dist = closer_domain[np.argmax(closer_quals)]
    print('Best distance:', best_dist)

    get_quality(embeds, gold_markup, url2record, best_dist, print_result=True)
    log_to_wandb(embeds, gold_markup, best_dist, url2record, text_to_vec_func)
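
get_text_to_vector_func is shared with example no. 14, where the 'bert-FirstCLS' variant is requested; a minimal sketch of such a variant, assuming the 768-dimensional embeddings above are the encoder's hidden state at the [CLS] position (an assumption, not the project's exact code):

import torch

def bert_first_cls_vector(text, model, tokenizer):
    # Encode the text and return the encoder's [CLS] hidden state; relies on
    # tokenizer.max_tokens_text having been set as above.
    inputs = tokenizer(
        text,
        max_length=tokenizer.max_tokens_text,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        encoder_out = model.get_encoder()(
            input_ids=inputs["input_ids"].cuda(),
            attention_mask=inputs["attention_mask"].cuda(),
        )
    return encoder_out.last_hidden_state[:, 0]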
Example no. 14
def make_inference_and_save(
    config_file,
    eval_model_file,
    test_file,
    test_sample_rate,
    enable_bottleneck,
    cluster_model_file,
    clustering_dist_threshold,
    out_path_prefix,
    dataset_type,
    style_model_eval,
):
    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    cls = BottleneckEncoderDecoderModel if enable_bottleneck else EncoderDecoderModel
    model = cls.from_pretrained(eval_model_file)
    model.eval()
    model.cuda()

    if cluster_model_file:
        test_sample_rate = 1.
        filter_dates = ('2020-05-12', )
    else:
        filter_dates = None

    if dataset_type == 'ria':
        print("Fetching RIA data...")
        test_records = [
            r for r in tqdm.tqdm(ria_reader(test_file))
            if random.random() <= test_sample_rate
        ]
    else:
        print("Fetching TG data...")
        test_records = [
            r
            for r in tqdm.tqdm(tg_reader(test_file, filter_dates=filter_dates))
            if random.random() <= test_sample_rate
        ]

    print("Building datasets...")

    if style_model_eval:
        agency_list = config['agency_list']
        agency_to_special_token_id = {
            a: tokenizer.vocab[f'[unused{i+1}]']
            for i, a in enumerate(agency_list)
        }

        test_dataset = AgencyTitleDatasetGeneration(
            test_records,
            tokenizer,
            filter_agencies=None,
            agency_to_special_token_id=agency_to_special_token_id,
            max_tokens_text=max_tokens_text,
            max_tokens_title=max_tokens_title)
    else:
        test_dataset = GenTitleDataset(test_records,
                                       tokenizer,
                                       max_tokens_text=max_tokens_text,
                                       max_tokens_title=max_tokens_title)

    print('Dataset size:', len(test_dataset))

    if cluster_model_file:
        from utils.clustering_utils import get_text_to_vector_func
        clusterer = Clusterer(
            get_text_to_vector_func(
                'bert-FirstCLS',
                BottleneckEncoderDecoderModel.from_pretrained(
                    cluster_model_file), tokenizer),
            test_dataset,
            clustering_dist_threshold,
            dates=filter_dates,
        )

        clusterer.perform_clustering()

    with open(out_path_prefix + 'prediction.txt', 'w', encoding='utf-8') as pf, \
            open(out_path_prefix + 'gold.txt', 'w', encoding='utf-8') as gf:
        for i in tqdm.trange(0, len(test_dataset), batch_size):
            data = test_dataset[i]
            for k in tuple(data.keys()):
                if k not in ('input_ids', 'attention_mask'):
                    del data[k]
                else:
                    data[k] = data[k].unsqueeze(0)

            for j in range(i + 1, min(i + batch_size, len(test_dataset))):
                for k in data.keys():
                    data[k] = torch.cat(
                        (data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

            data['input_ids'] = data['input_ids'].cuda()
            data['attention_mask'] = data['attention_mask'].cuda()

            output_ids = model.generate(
                **data,
                decoder_start_token_id=model.config.decoder.pad_token_id,
                min_length=7,
                max_length=20,
                num_beams=6)

            preds = [
                tokenizer.decode(first_sent(x, tokenizer.sep_token_id),
                                 skip_special_tokens=True) for x in output_ids
            ]

            for j in range(i, min(i + batch_size, len(test_dataset))):
                if cluster_model_file:
                    refs = []
                    for r in clusterer.get_cluster_records(j):
                        refs.append(r['title'])

                    gf.write(' s_s '.join(refs) + '\n')
                else:
                    gf.write(test_dataset.get_strings(j)['title'] + '\n')
                pf.write(preds[j - i] + '\n')
Example no. 15
def distil_embeddings(config_file: str, train_file: str, val_file: str,
                      train_sample_rate: float, val_sample_rate: float,
                      input_model_path: str, output_model_path: str,
                      random_seed: int, neptune_project: str,
                      saved_embeddings: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Fetching data...")
    train_records = [
        r for r in parse_tg_jsonl(train_file)
        if random.random() <= train_sample_rate
    ]
    val_records = [
        r for r in parse_tg_jsonl(val_file)
        if random.random() <= val_sample_rate
    ]

    tokenizer = AutoTokenizer.from_pretrained(input_model_path)
    max_tokens_count = config.get("max_tokens_count", 196)
    if not saved_embeddings or not os.path.isfile(saved_embeddings):
        print("Loading teacher model...")
        input_model = AutoModel.from_pretrained(input_model_path)

        print("Saving embeddings...")
        url2text = {
            r["url"]: r["text"]
            for r in itertools.chain(train_records, val_records)
        }
        urls = []
        embeddings = []
        batch_urls = []
        batch_texts = []
        batch_size = 8
        for url, text in tqdm(url2text.items()):
            batch_urls.append(url)
            batch_texts.append(text)
            if len(batch_urls) == batch_size:
                urls.extend(batch_urls)
                batch_embeddings = calc_batch_embeddings(
                    batch_texts, tokenizer, input_model, max_tokens_count)
                for embedding in batch_embeddings:
                    embeddings.append(embedding)
                batch_urls = []
                batch_texts = []
        if batch_urls:
            urls.extend(batch_urls)
            batch_embeddings = calc_batch_embeddings(batch_texts, tokenizer,
                                                     input_model,
                                                     max_tokens_count)
            for embedding in batch_embeddings:
                embeddings.append(embedding)
        embeddings = torch.tensor(embeddings)
        data = {"urls": urls, "embeddings": embeddings}
        torch.save(data, saved_embeddings)
    else:
        print("Loading embeddings...")
        data = torch.load(saved_embeddings)

    url2num = {url: num for num, url in enumerate(data["urls"])}
    num2embedding = data["embeddings"]

    batch_size = config.get("batch_size", 32)
    num_workers = config.get("num_workers", 5)
    train_dataset = EmbeddingsAsTargetDataset(train_records, url2num,
                                              num2embedding, tokenizer,
                                              max_tokens_count)
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              num_workers=num_workers)

    val_dataset = EmbeddingsAsTargetDataset(val_records, url2num,
                                            num2embedding, tokenizer,
                                            max_tokens_count)
    val_loader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            num_workers=num_workers)

    patience = config.get("patience", 4)
    epochs = config.get("epochs", 5)
    gradient_clip_val = config.get("gradient_clip_val", 1.0)

    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Distil embeddings",
            tags=["training", "pytorch-lightning", "distil"],
            params=params)

    lightning_model = DistilEmbeddingBertLightning(config)
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=True,
                                        mode="min")
    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      gradient_clip_val=gradient_clip_val,
                      deterministic=True,
                      logger=logger)
    trainer.fit(lightning_model, train_loader, val_loader)
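
calc_batch_embeddings is assumed to turn a batch of texts into one fixed-size teacher embedding per text. A sketch consistent with the torch.tensor(embeddings) call above (it returns plain Python lists), with mean pooling standing in for whatever pooling the project actually uses:

import torch

def calc_batch_embeddings_sketch(texts, tokenizer, model, max_tokens_count):
    inputs = tokenizer(
        texts,
        max_length=max_tokens_count,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    # Mean-pool over non-padding positions and return plain lists so that
    # torch.tensor(embeddings) in the caller stacks them cleanly.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return [vec.tolist() for vec in pooled]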