def __init__(self,
             model_name_or_path: str = 'albert-large-uncased',  # or a local path, e.g. './vocab.txt'
             datasets_loader: str = 'race',  # or a local loader script, e.g. 'RACELocalLoader.py'
             task_name: str = 'all',
             max_seq_length: int = 512,
             train_batch_size: int = 32,
             eval_batch_size: int = 32,
             num_workers: int = 8,
             num_preprocess_processes: int = 8,
             use_sentence_selection: bool = True,
             best_k_sentences: int = 5,
             **kwargs):
    super().__init__()
    self.model_name_or_path = model_name_or_path
    self.dataset_loader = datasets_loader
    self.task_name = task_name
    self.max_seq_length = max_seq_length
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size
    self.num_workers = num_workers
    self.num_preprocess_processes = num_preprocess_processes
    self.use_sentence_selection = use_sentence_selection
    self.best_k_sentences = best_k_sentences
    self.tokenizer = AlbertTokenizerFast.from_pretrained(
        self.model_name_or_path, use_fast=True, do_lower_case=True)
    self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'],
                                           use_stemmer=True)
    self.dataset = None
def init_albert():
    albert_max_len = 512
    albert_path = "albert_base_turkish_uncased/"
    albert_model_name = "loodos-albert-base-turkish-uncased_seqlen512_batch64_epochs10/"
    albert_tokenizer = AlbertTokenizerFast.from_pretrained(albert_path,
                                                           do_lower_case=False,
                                                           keep_accents=True)
    albert_model_class = Model(albert_max_len, albert_path, albert_model_name,
                               albert_tokenizer, "albert")
    print("3. ALBERT LOADED")
    return albert_model_class
import random
from collections import defaultdict
from functools import partial
from multiprocessing import cpu_count

import nltk
from datasets import load_dataset
from transformers import AlbertTokenizerFast

# `create_instances_from_document` is assumed to be defined elsewhere in this file.


def tokenize_function(tokenizer, examples):
    # Signature inferred from the partial(tokenize_function, tokenizer) call in
    # the batched map() below; the original snippet begins inside the body.
    texts = examples["text"]
    new_examples = defaultdict(list)
    for text in texts:
        instances = create_instances_from_document(tokenizer, text, max_seq_length=512)
        for instance in instances:
            for key, value in instance.items():
                new_examples[key].append(value)
    return new_examples


if __name__ == '__main__':
    random.seed(0)
    nltk.download('punkt')
    tokenizer = AlbertTokenizerFast.from_pretrained('albert-large-v2')
    wikitext = load_dataset('wikitext', 'wikitext-103-v1', cache_dir='./data/cache')
    tokenized_datasets = wikitext.map(
        partial(tokenize_function, tokenizer),
        batched=True,
        num_proc=cpu_count(),
        remove_columns=["text"],
    )
    tokenized_datasets.save_to_disk('./data/albert_tokenized_wikitext')
    tokenizer.save_pretrained('./data/tokenizer')
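# The saved corpus can be reloaded later (as the training scripts further down
# do via load_from_disk); a minimal sketch:
#
#   from datasets import load_from_disk
#   tokenized_datasets = load_from_disk('./data/albert_tokenized_wikitext')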
def fillmask(tweet):
    tokenizer = AlbertTokenizerFast.from_pretrained(save_directory)
    model = AlbertModel.from_pretrained("albert-base-v2")
    # With BERT instead:
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # model = BertModel.from_pretrained("bert-base-uncased")

    def words_in_string(word_list, a_string):
        """Return the words that appear both in the tweet and in the hateful-words list."""
        return set(word_list).intersection(a_string.split())

    # Load the hateful-words list.
    with open("./model/fill-masking/hate_words.txt", "r") as text_file:
        lines = [item.replace("\n", "") for item in text_file.readlines()]

    # Normalize the tweet: separate punctuation into its own tokens.
    tweet = tweet + " !!!"
    tweet_tokens = re.findall(r"[\w']+|[.,!?;]", tweet)
    tweet = " ".join(map(str, tweet_tokens))

    words = list(words_in_string(lines, tweet))
    if not words:
        return ""

    if len(words) == 1:
        target = words[0]
    else:
        # Mask the word with the lowest (most negative) AFINN sentiment score.
        sentiment_scores = [af.score(word) for word in words]
        target = words[sentiment_scores.index(min(sentiment_scores))]
    tweet = tweet.replace(target, "[MASK]")

    # Forward pass through the ALBERT encoder (its output is not used below).
    encoded_input = tokenizer(tweet, return_tensors='pt')
    output = model(**encoded_input)

    # Suggest replacements for the masked word with a BERT fill-mask pipeline.
    unmasker = pipeline('fill-mask', model='bert-base-uncased')
    res = [prediction["token_str"] for prediction in unmasker(tweet)]

    # Keep the three candidates with the highest AFINN sentiment scores.
    score = [af.score(word) for word in res]
    top_words = sorted(range(len(score)), key=lambda i: score[i], reverse=True)[:3]
    top_3 = [res[i] for i in top_words]
    return [target, top_3]
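# Hypothetical usage of fillmask(); the input tweet is illustrative and the
# suggestions depend on the fill-mask model and the hate-words list, so real
# output will vary:
#
#   >>> fillmask("some offensive tweet text")
#   Returns "" (no flagged word found) or [flagged_word, [three_candidate_replacements]]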
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, BertTokenizer, BertModel
from transformers import AlbertTokenizerFast, AlbertForSequenceClassification, AlbertModel
from transformers import pipeline
from scipy.special import softmax
from afinn import Afinn
import pandas as pd
import numpy as np
import torch
import re
import sentencepiece

af = Afinn()

save_directory = './model/classification/AlBERT'
tokenizer = AlbertTokenizerFast.from_pretrained(save_directory)
model = AlbertForSequenceClassification.from_pretrained(save_directory)


##################
# CLASSIFICATION #
##################
def predict(tweet):
    # Tokenize the tweet
    encoded_input = tokenizer.encode(tweet,
                                     truncation=True,
                                     padding=True,
                                     return_tensors="pt")
    # Predict the tweet's class logits
    output = model(encoded_input)
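    # The original snippet ends at the forward pass above. A minimal sketch of a
    # presumable continuation, using the softmax already imported from scipy
    # (this post-processing is an assumption, not part of the source):
    scores = softmax(output.logits.detach().numpy(), axis=1)
    predicted_class = int(np.argmax(scores, axis=1)[0])
    return predicted_class, scores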
def load(cls, pretrained_model_name_or_path, revision=None, tokenizer_class=None, use_fast=True, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from the model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param revision: The version of the model to use from the HuggingFace model hub. Can be a tag name, branch name, or commit hash.
    :type revision: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False). Fast tokenizers are loaded for the model types that provide them.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    kwargs["revision"] = revision

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return the appropriate tokenizer object
    ret = None
    if "AlbertTokenizer" in tokenizer_class:
        if use_fast:
            ret = AlbertTokenizerFast.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "XLMRobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLMRobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "XLNetTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLNetTokenizerFast.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
        # Only the slow EmbeddingTokenizer exists, so it is used in both cases.
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "CamembertTokenizer" in tokenizer_class:
        if use_fast:
            ret = CamembertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DPRContextEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
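# Hypothetical usage, assuming this classmethod is exposed on FARM's Tokenizer
# class (the model name below is illustrative):
#
#   tokenizer = Tokenizer.load("albert-base-v2", use_fast=True)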
def preprocess(tokenizer: AlbertTokenizerFast, scorer: rouge_scorer,
               max_seq_length: int, use_sentence_selection: bool,
               best_k_sentences: int, x: Dict) -> Dict:
    choices_features = []
    label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
    question = x["question"]
    article = x['article']

    if use_sentence_selection:
        qa = [question + option for option in x["options"]]
        sentences = [s for s in article.split('.') if s != '']

        # Score every sentence against every question+option pair using
        # ROUGE-1 plus ROUGE-2 precision.
        question_len = len(qa)
        sentences_len = len(sentences)
        sentence_scores = np.empty((sentences_len, question_len))
        for (i, j) in product(range(sentences_len), range(question_len)):
            scores = scorer.score(sentences[i], qa[j])
            sentence_scores[i, j] = scores['rouge1'].precision + scores['rouge2'].precision

        # An earlier cosine-similarity variant over token ids:
        # question_tokens = np.array(tokenizer(qa, add_special_tokens=False)['input_ids'])
        # sentences_tokens = np.array(tokenizer(sentences, add_special_tokens=False)['input_ids'])
        # sentence_scores = np.dot(sentences_tokens, question_tokens.T) / (np.linalg.norm(
        #     sentences_tokens, axis=1).reshape(-1, 1) @ np.linalg.norm(
        #     question_tokens, axis=1).reshape(1, -1))

        # Keep the k best-scoring sentences plus their immediate neighbours.
        max_sentence_score = np.max(sentence_scores, axis=1)
        best_sentence_indices = max_sentence_score.argsort()[-best_k_sentences:][::-1]
        final_indices = set()
        for index in best_sentence_indices:
            final_indices.add(index - 1)
            final_indices.add(index)
            final_indices.add(index + 1)
        final_indices.discard(-1)
        final_indices.discard(sentences_len)
        article = '.'.join([sentences[i] for i in sorted(final_indices)])

    question_len = len(tokenizer.tokenize(question))
    option: str
    for option in x["options"]:
        question_option = question + option
        # Alternative: handle fill-in-the-blank questions explicitly:
        # if question.find("_") != -1:
        #     question_option = question.replace("_", option)
        # else:
        #     question_option = question + " [SEP] " + option
        option_len = len(tokenizer.tokenize(option))

        inputs = tokenizer(article,
                           question_option,
                           add_special_tokens=True,
                           max_length=max_seq_length,
                           truncation=True,
                           padding='max_length',
                           return_tensors='pt')
        token_type_ids = np.array(inputs['token_type_ids'])
        # Article length = index of the first second-segment token, minus [CLS] and [SEP].
        inputs['article_len'] = int(np.where(token_type_ids == 1)[1][0]) - 2
        inputs['option_len'] = option_len
        choices_features.append(inputs)

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()
    return {
        "label": label,
        "input_ids": torch.cat([cf["input_ids"] for cf in choices_features]).reshape(-1),
        "attention_mask": torch.cat([cf["attention_mask"] for cf in choices_features]).reshape(-1),
        "token_type_ids": torch.cat([cf["token_type_ids"] for cf in choices_features]).reshape(-1),
        "article_len": torch.tensor([cf["article_len"] for cf in choices_features]).long(),
        "question_len": torch.tensor([question_len] * 4).long(),
        "option_len": torch.tensor([cf["option_len"] for cf in choices_features]).long(),
    }
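# A hypothetical invocation of preprocess() on a toy RACE-style example, just to
# show the expected input schema; the article, question, and options are made up.
toy_example = {
    "article": "Tom went to the market. He bought some apples. Then he walked home.",
    "question": "What did Tom buy?",
    "options": ["Apples", "Oranges", "Bread", "Milk"],
    "answer": "A",
}
# tokenizer = AlbertTokenizerFast.from_pretrained('albert-large-v2')
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
# features = preprocess(tokenizer, scorer, 512, True, 5, toy_example)
# features["input_ids"] then has shape (4 * 512,): the four options flattened.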
def prepare_data(self):
    # Trigger the dataset and tokenizer downloads once, before any distributed setup.
    datasets.load_dataset(self.dataset_loader, self.task_name)
    AlbertTokenizerFast.from_pretrained(self.model_name_or_path, use_fast=True)
import torch
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizerFast

with open('data/bypublisher/articles.txt') as f:
    articles = f.readlines()
articles = [x.strip().lower() for x in articles]

with open('data/bypublisher/labels.txt') as f:
    labels = f.readlines()
labels = [int(x.strip()) for x in labels]

train_texts, other_texts, train_labels, other_labels = train_test_split(
    articles, labels, test_size=.2)
# Split the held-out 20% evenly into validation and test halves.
val_texts = other_texts[:len(other_texts) // 2]
val_labels = other_labels[:len(other_labels) // 2]
test_texts = other_texts[len(other_texts) // 2:]
test_labels = other_labels[len(other_labels) // 2:]

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v1")
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length')
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length')
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length')


class HyperpartisanshipDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        # Attach the label for this index.
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
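# A minimal sketch wrapping the encodings in datasets, assuming the splits
# above; the DataLoader batch size is an arbitrary choice for illustration.
train_dataset = HyperpartisanshipDataset(train_encodings, train_labels)
val_dataset = HyperpartisanshipDataset(val_encodings, val_labels)
test_dataset = HyperpartisanshipDataset(test_encodings, test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)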
def main():
    parser = HfArgumentParser(
        (AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    logger.info(
        f"Found {len(collaboration_args.initial_peers)} initial peers: {collaboration_args.initial_peers}")
    if len(collaboration_args.initial_peers) == 0:
        raise ValueError("Please specify at least one network endpoint in initial peers.")

    collaboration_args_dict = asdict(collaboration_args)
    setup_logging(training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path,
                                          cache_dir=dataset_args.cache_dir)
    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path,
                                                    cache_dir=dataset_args.cache_dir)
    model = get_model(training_args, config, tokenizer)
    model.to(training_args.device)

    tokenized_datasets = load_from_disk(Path(dataset_args.dataset_path))
    # This data collator will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    opt, scheduler = get_optimizer_and_scheduler(training_args, model)

    validators, local_public_key = metrics_utils.make_validators(
        collaboration_args_dict['experiment_prefix'])
    dht = hivemind.DHT(
        start=True,
        initial_peers=collaboration_args_dict.pop('initial_peers'),
        listen=not collaboration_args_dict['client_mode'],
        listen_on=collaboration_args_dict.pop('dht_listen_on'),
        endpoint=collaboration_args_dict.pop('endpoint'),
        record_validators=validators)

    total_batch_size_per_step = (training_args.per_device_train_batch_size *
                                 training_args.gradient_accumulation_steps)
    statistics_expiration = collaboration_args_dict.pop('statistics_expiration')
    adjusted_target_batch_size = (collaboration_args_dict.pop('target_batch_size') -
                                  collaboration_args_dict.pop('batch_size_lead'))

    collaborative_optimizer = hivemind.CollaborativeOptimizer(
        opt=opt,
        dht=dht,
        scheduler=scheduler,
        prefix=collaboration_args_dict.pop('experiment_prefix'),
        compression_type=hivemind.utils.CompressionType.Value(
            collaboration_args_dict.pop('compression')),
        batch_size_per_step=total_batch_size_per_step,
        throughput=collaboration_args_dict.pop('bandwidth'),
        target_batch_size=adjusted_target_batch_size,
        client_mode=collaboration_args_dict.pop('client_mode'),
        verbose=True,
        start=True,
        **collaboration_args_dict)

    class TrainerWithIndependentShuffling(Trainer):
        def get_train_dataloader(self) -> DataLoader:
            """Shuffle data independently for each peer to avoid duplicating batches [important for quality]."""
            torch.manual_seed(hash(local_public_key))
            return super().get_train_dataloader()

    trainer = TrainerWithIndependentShuffling(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        optimizers=(collaborative_optimizer, NoOpScheduler(collaborative_optimizer)),
        callbacks=[
            CollaborativeCallback(dht, collaborative_optimizer, model,
                                  local_public_key, statistics_expiration)
        ])
    trainer.remove_callback(transformers.trainer_callback.PrinterCallback)
    trainer.remove_callback(transformers.trainer_callback.ProgressCallback)

    # Training
    if training_args.do_train:
        latest_checkpoint_dir = max(
            Path(training_args.output_dir).glob('checkpoint*'),
            default=None,
            key=os.path.getctime)
        trainer.train(model_path=latest_checkpoint_dir)
def main():
    parser = HfArgumentParser(
        (AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")

    # Set the verbosity of the Transformers logger to INFO (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path,
                                          cache_dir=dataset_args.cache_dir)
    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path,
                                                    cache_dir=dataset_args.cache_dir)

    # Find the latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'),
                                default=None,
                                key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info('Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    tokenized_dataset_path = Path(dataset_args.dataset_path)
    tokenized_datasets = load_from_disk(tokenized_dataset_path)

    # Data collator: this one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Exclude biases and LayerNorm weights from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = FusedLAMB(
        optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
    )

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=training_args.max_steps,
    )

    trainer = CollaborativeTrainer(
        model=model,
        args=training_args,
        collaboration_args=collaboration_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=latest_checkpoint_dir)
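# Hypothetical launch: HfArgumentParser maps each dataclass field to a CLI flag,
# so the exact flag names depend on the AlbertTrainingArguments / DatasetArguments /
# CollaborationArguments definitions; the paths below are illustrative only.
#
#   python run_trainer.py --output_dir ./outputs \
#       --config_path albert-large-v2 \
#       --tokenizer_path ./data/tokenizer \
#       --dataset_path ./data/albert_tokenized_wikitext \
#       --do_train
if __name__ == "__main__":
    main()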