    def setUp(self) -> None:
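        # Disable Weights & Biases logging while running tests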
        os.environ["WANDB_DISABLED"] = "true"
        os.environ["WANDB_WATCH"] = "false"
        self.env = get_env()

        self.cache_dir = '../data/transformers_cache'
        self.sample_text = ' '.join(['Hello world! '] * 10)
        self.num_labels = 5
Example 2
    def test_cls_init(self):
        env = get_env()

        exp = Experiment(
            # random_seed=0,
            epochs=1,
            model_cls='models.transformers.JointBERT',
            model_params={
                'bert_model_path': env['bert_dir'] + '/bert-base-cased',
                'labels_count': 3,
            },
            loss_func_cls='torch.nn.BCELoss',
            model_output_to_loss_input=lambda ys: ys.double(),
            data_helper_cls='wiki.data_helpers.JointBERTWikiDataHelper',
            data_helper_params={
                'wiki_relations_path': '../wiki/relations.csv',
                'wiki_articles_path': '../wiki/docs.pickle',
                'labels': ['employer', 'country_of_citizenship'],
                # alternatives: 'employer', 'capital', 'country_of_citizenship', 'educated_at', 'opposite_of'
                'label_col': 'relation_name',
                'negative_sampling_ratio': 1.,
                'train_test_split': 0.7,
                'max_seq_length': 512,
                'train_batch_size': 4,
                'test_batch_size': 4,
                'bert_model_path': env['bert_dir'] + '/bert-base-cased',
                # 'bert_tokenizer_cls': '',
                'bert_tokenizer_params': {
                    'do_lower_case': False,
                },
                'df_limit': 3,
            },
            tqdm_cls='tqdm.tqdm',
            output_dir='../output',
        )

        assert isinstance(exp.model, JointBERT)
        assert isinstance(exp.data_helper, JointBERTWikiDataHelper)
        assert isinstance(exp.loss_func, BCELoss)
        assert tqdm == exp.tqdm_cls

        print(flatten(exp.to_dict()))

        exp.run()
Example 3
File: data_cli.py Project: j5bd/q
def build_transformers_vectors(hf_dataset: str,
                               model_name_or_path: str,
                               output_path: str,
                               pooling: str,
                               batch_size: int = 16,
                               override: bool = False):
    """

    $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset:
    :param model_name_or_path:
    :param output_path:
    :param pooling:
    :param override:
    :return:
    """

    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]

    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)

    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
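    # shuffle=False keeps batches aligned with docs_ds, so paper IDs can be sliced by position below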
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)

            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
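                # 'cls' pooling: the model's pooler_output ([CLS] hidden state passed through the pooler layer)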
                batch_embeddings = outputs['pooler_output'].detach().cpu().numpy()
            elif pooling == 'mean':
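                # Mean over all token positions; note that padding tokens are included in this average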
                batch_embeddings = np.mean(
                    outputs['last_hidden_state'].detach().cpu().numpy(),
                    axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size +
                                batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
Example 4
    def __init__(self, *args, **kwargs):
        os.environ["WANDB_DISABLED"] = "true"
        os.environ["WANDB_WATCH"] = "false"

        super().__init__(*args, **kwargs)
        self.env = get_env()
Example 5
import os

from tqdm import tqdm
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, XLNetModel, XLNetTokenizer

from experiments.environment import get_env
from experiments.predefined import update

env = get_env()
split_dir = './output/splits10k/'
# 'has_cause' (P828) was dropped from the label set below
labels = [
    'different_from', 'employer', 'facet_of', 'country_of_citizenship',
    'opposite_of', 'has_quality', 'symptoms', 'has_effect', 'educated_at'
]
dh_params = dict(
    train_dataframe_path=split_dir + 'train.csv',
    test_dataframe_path=split_dir + 'test.csv',
    wiki_relations_path=None,
    # no leading slash here, otherwise os.path.join would discard datasets_dir
    wiki_articles_path=os.path.join(
        env['datasets_dir'],
        'wikipedia_en/dumps/enwiki-20191101-pages-articles.weighted.10k.jsonl'
    ),
    train_batch_size=4,
    test_batch_size=5,
    include_section_title=False,
    labels=labels,
    label_col='relation_name',
    max_seq_length=512,
    workers=env['workers'],
)
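# dh_params is presumably passed as data_helper_params to an Experiment (cf. Example 2)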
Example 6
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser(
        (ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses(
    )

    # Adjust output with folds and model name
    #TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(
            env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y in [0, 1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    columns = ['input_ids', 'attention_mask', 'token_type_ids',
               'labels']  # Input to transformers.forward

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(
        experiment_args.hf_dataset),
                           name='docs',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(
            f'Train and test datasets limited to {experiment_args.dataset_limit} samples'
        )

        train_ds = train_ds.select(
            range(min(experiment_args.dataset_limit, len(train_ds))))
        test_ds = test_ds.select(
            range(min(experiment_args.dataset_limit, len(test_ds))))

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError(
                'RNN baseline is only available for multi label classification'
            )

        tokenizer = None

    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(
                    roberta_path) else 'roberta-base'

                logger.info(
                    f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()
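    # Any change to experiment or model arguments yields a new hash and thus a separate cache file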

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')

    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load models weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir,
                                      'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(
            f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error(f'Weights file does not exist: {model_weights_path}')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        #prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)

        elif isinstance(model, nn.Module):  # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'),
                  index=False)
        json.dump(
            predictions.metrics,
            open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')
Example 7
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.env = get_env()
        self.data_dir = data_dir = os.path.join(self.env['datasets_dir'],
                                                'acl-anthology')