def setUp(self) -> None:
    os.environ["WANDB_DISABLED"] = "true"
    os.environ["WANDB_WATCH"] = "false"

    self.env = get_env()
    self.cache_dir = '../data/transformers_cache'
    self.sample_text = ' '.join(['Hello world! '] * 10)
    self.num_labels = 5
def test_cls_init(self):
    env = get_env()
    exp = Experiment(
        # random_seed=0,
        epochs=1,
        model_cls='models.transformers.JointBERT',
        model_params={
            'bert_model_path': env['bert_dir'] + '/bert-base-cased',
            'labels_count': 3,
        },
        loss_func_cls='torch.nn.BCELoss',
        model_output_to_loss_input=lambda ys: ys.double(),
        data_helper_cls='wiki.data_helpers.JointBERTWikiDataHelper',
        data_helper_params={
            'wiki_relations_path': '../wiki/relations.csv',
            'wiki_articles_path': '../wiki/docs.pickle',
            # Other candidate labels: 'capital', 'educated_at', 'opposite_of'
            'labels': ['employer', 'country_of_citizenship'],
            'label_col': 'relation_name',
            'negative_sampling_ratio': 1.,
            'train_test_split': 0.7,
            'max_seq_length': 512,
            'train_batch_size': 4,
            'test_batch_size': 4,
            'bert_model_path': '/Volumes/data/repo/data/bert/bert-base-cased',
            # 'bert_tokenizer_cls': '',
            'bert_tokenizer_params': {
                'do_lower_case': False,
            },
            'df_limit': 3,
        },
        tqdm_cls='tqdm.tqdm',
        output_dir='../output',
    )

    assert isinstance(exp.model, JointBERT)
    assert isinstance(exp.data_helper, JointBERTWikiDataHelper)
    assert isinstance(exp.loss_func, BCELoss)
    assert tqdm == exp.tqdm_cls

    print(flatten(exp.to_dict()))

    exp.run()
def build_transformers_vectors(hf_dataset: str,
                               model_name_or_path: str,
                               output_path: str,
                               pooling: str,
                               batch_size: int = 16,
                               override: bool = False):
    """
    Builds document vectors with a pretrained Transformer model and saves them in word2vec format.

    Usage:

    $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset: Name of the Hugging Face dataset (resolved via get_local_hf_dataset_path)
    :param model_name_or_path: Model name or path to a model directory
    :param output_path: Path to the output file (word2vec format)
    :param pooling: Pooling strategy ('cls' or 'mean')
    :param batch_size: Batch size for inference
    :param override: Override an existing output file
    """
    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')

    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]
    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)
    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)
            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
                batch_embeddings = outputs['pooler_output'].detach().cpu().numpy()
            elif pooling == 'mean':
                batch_embeddings = np.mean(outputs['last_hidden_state'].detach().cpu().numpy(), axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size + batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
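# Hedged usage sketch: the same command as in the docstring, invoked directly from
# Python (e.g. from a notebook or a test) instead of via the CLI. Dataset and model
# names are taken from the docstring example; the call is commented out so it does
# not run on import.
#
# build_transformers_vectors(
#     hf_dataset='paperswithcode_aspects',
#     model_name_or_path='scibert-scivocab-uncased',
#     output_path='./output/scibert-cls',
#     pooling='cls',
#     batch_size=16,
# )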
def __init__(self, *args, **kwargs):
    os.environ["WANDB_DISABLED"] = "true"
    os.environ["WANDB_WATCH"] = "false"

    super().__init__(*args, **kwargs)

    self.env = get_env()
import os

from tqdm import tqdm
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, XLNetModel, XLNetTokenizer

from experiments.environment import get_env
from experiments.predefined import update

env = get_env()
split_dir = './output/splits10k/'

# 'has_cause' (P828) is intentionally dropped from the label set
labels = [
    'different_from', 'employer', 'facet_of', 'country_of_citizenship',
    'opposite_of', 'has_quality', 'symptoms', 'has_effect', 'educated_at'
]

dh_params = dict(
    train_dataframe_path=split_dir + 'train.csv',
    test_dataframe_path=split_dir + 'test.csv',
    wiki_relations_path=None,
    # No leading slash: os.path.join would otherwise discard datasets_dir
    wiki_articles_path=os.path.join(
        env['datasets_dir'],
        'wikipedia_en/dumps/enwiki-20191101-pages-articles.weighted.10k.jsonl'
    ),
    train_batch_size=4,
    test_batch_size=5,
    include_section_title=False,
    labels=labels,
    label_col='relation_name',
    max_seq_length=512,
    workers=env['workers'],
)
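# Sketch only (assumption): dh_params is intended to be passed to an Experiment as
# its data_helper_params, analogous to the JointBERT test elsewhere in this repo.
# The class paths below mirror that test; everything else about how this config
# module is consumed is an assumption and left commented out.
#
# exp = Experiment(
#     epochs=1,
#     model_cls='models.transformers.JointBERT',
#     model_params={'bert_model_path': env['bert_dir'] + '/bert-base-cased',
#                   'labels_count': len(labels)},
#     data_helper_cls='wiki.data_helpers.JointBERTWikiDataHelper',
#     data_helper_params=dh_params,
#     output_dir='./output',
# )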
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser((ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses()

    # Adjust output with folds and model name
    # TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y=[0;1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    # Input to transformers.forward
    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='docs',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(f'Train and test datasets limited to {experiment_args.dataset_limit} samples')

        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model, disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError('RNN baseline is only available for multi-label classification')

        tokenizer = None
    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                                  num_labels=num_labels,
                                                  cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of the sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # Override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(roberta_path) else 'roberta-base'

                logger.info(f'Overriding tokenizer: {model_args.tokenizer_name}')

            # Override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(experiment_args.hf_dataset_cache_dir,
                                     "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(experiment_args.hf_dataset_cache_dir,
                                     "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')

    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load model weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir, 'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(f'Loading existing model weights from disk: {model_weights_path}')

        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        # prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
        elif isinstance(model, nn.Module):
            # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds, docs_ds, predictions, exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'), index=False)
        json.dump(predictions.metrics,
                  open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')
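# Assumed entry point: scripts built around HfArgumentParser are typically run
# directly, so a standard guard like the following is expected at module level
# (not shown in the original snippet).
if __name__ == "__main__":
    main()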
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.env = get_env()
    self.data_dir = data_dir = os.path.join(self.env['datasets_dir'], 'acl-anthology')