def create_pseudo_labeled_data(args, infer_input, infer_output, eval_result, id2label, next_data_dir):
    """Create pseudo-labeled data for the next self-training iteration."""
    dataset = datasets.concatenate_datasets([infer_input, infer_output], axis=1)

    if args.do_filter_by_confidence:
        dataset = dataset.filter(lambda example: example['probability'] > args.confidence_threshold)

    if args.do_filter_by_val_performance:
        assert 0.0 <= eval_result <= 1.0
        num_selected_rows = int(eval_result * len(dataset))
        print(num_selected_rows)
        dataset = dataset.sort('probability', reverse=True)
        dataset = dataset.select(range(num_selected_rows))

    dataset = dataset.remove_columns(['label', 'probability'])
    dataset = dataset.rename_column('prediction', 'label')
    dataset = dataset.map(lambda example: {'label': id2label[example['label']]})
    dataset = dataset.shuffle(seed=args.seed)

    pseudo_labeled_data_file = os.path.join(next_data_dir, f'train_pseudo.{args.data_file_extension}')
    if args.data_file_extension == 'csv':
        dataset.to_csv(pseudo_labeled_data_file, index=False)
    else:
        dataset.to_json(pseudo_labeled_data_file)
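# A minimal usage sketch for create_pseudo_labeled_data above (not from the original source).
# It assumes `infer_input` holds the original columns plus 'label', while `infer_output`
# holds row-aligned 'prediction' and 'probability' columns, as the axis=1 concat requires.
import os
import datasets
from argparse import Namespace
from datasets import Dataset

args = Namespace(do_filter_by_confidence=True, confidence_threshold=0.8,
                 do_filter_by_val_performance=False, seed=42, data_file_extension='csv')
infer_input = Dataset.from_dict({'text': ['good movie', 'bad movie'], 'label': [1, 0]})
infer_output = Dataset.from_dict({'prediction': [1, 0], 'probability': [0.95, 0.60]})
id2label = {0: 'negative', 1: 'positive'}

create_pseudo_labeled_data(args, infer_input, infer_output,
                           eval_result=1.0, id2label=id2label, next_data_dir='.')
# writes ./train_pseudo.csv containing only the confidently predicted row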
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(args.data.sub_datasets.split(","), args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)
        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # sub dataset must have same features: ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features
        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(), features=features)

        concatenate_list.append(new_sub_dataset.flatten_indices())

    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(), features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] + concatenate_list)

    return train_dataset
def concate(dataset_name, data, cache_dir):
    if dataset_name in dataset_types:
        all_datasets_downloaded = [
            load_dataset(dataset_name, sub_dataset, cache_dir=cache_dir)
            for sub_dataset in dataset_types[dataset_name]
        ]
        combined_datasets = [
            concatenate_datasets(list(sub_dataset.values()))
            for sub_dataset in all_datasets_downloaded
        ]
        data = concatenate_datasets(combined_datasets)
        return DatasetDict({"train": data})

    data = concatenate_datasets(
        list(load_dataset(dataset_name, cache_dir=cache_dir).values())
    )
    return DatasetDict({"train": data})
def download_dset(c, hf_tokenizer, cache_dir, num_proc):
    dsets = []
    ELECTRAProcessor = partial(ELECTRADataProcessor, hf_tokenizer=hf_tokenizer, max_length=c.max_length)

    # Wikipedia
    if 'wikipedia' in c.datas:
        print('load/download wiki dataset')
        wiki = datasets.load_dataset('wikipedia', '20200501.en', cache_dir=cache_dir)['train']
        print('load/create data from wiki dataset for ELECTRA')
        e_wiki = ELECTRAProcessor(wiki).map(
            cache_file_name=f"1000_electra_wiki_{c.max_length}.arrow", num_proc=num_proc)
        dsets.append(e_wiki)

    # OpenWebText
    if 'openwebtext' in c.datas:
        print('load/download OpenWebText Corpus')
        owt = datasets.load_dataset('openwebtext', cache_dir=cache_dir)['train']
        print('load/create data from OpenWebText Corpus for ELECTRA')
        e_owt = ELECTRAProcessor(owt, apply_cleaning=False).map(
            cache_file_name=f"electra_owt_{c.max_length}.arrow", num_proc=num_proc)
        dsets.append(e_owt)

    assert len(dsets) == len(c.datas)

    train_dset = datasets.concatenate_datasets(dsets)
    return train_dset
def step_3(dataset, args):
    """Balance positive and negative samples by randomly removing examples from the larger-sized class."""
    print('In Step 3')

    unq_ele_and_cts = Counter(dataset['label'])
    assert len(unq_ele_and_cts) == 2, 'There should only be two unique labels'

    key_with_max_val = max(unq_ele_and_cts, key=unq_ele_and_cts.get)
    key_with_min_val = min(unq_ele_and_cts, key=unq_ele_and_cts.get)
    extra_count = abs(unq_ele_and_cts[key_with_max_val] - unq_ele_and_cts[key_with_min_val])

    ## Divide dataset into two datasets based on their labels
    def filter_based_on_label(sample, idx, label):
        return label == sample['label']

    key_with_data = defaultdict()
    for key in unq_ele_and_cts.keys():
        key_with_data[key] = dataset.filter(filter_based_on_label, fn_kwargs={'label': key}, with_indices=True)

    ## Remove extra counts from class with more elements
    def filter_extra_samples_from_key_with_max_val(sample, idx, extra_count):
        return False if idx < extra_count else True

    key_with_data[key_with_max_val] = key_with_data[key_with_max_val].shuffle(seed=args.seed).filter(
        filter_extra_samples_from_key_with_max_val, fn_kwargs={'extra_count': extra_count}, with_indices=True)

    ## Combine the two datasets
    return concatenate_datasets(list(key_with_data.values())).sort('idx')
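# A minimal sketch of step_3 on a toy dataset (not from the original source); it assumes
# the dataset carries an integer 'idx' column, which the final .sort('idx') relies on.
from argparse import Namespace
from collections import Counter, defaultdict
from datasets import Dataset, concatenate_datasets

toy = Dataset.from_dict({
    'idx': [0, 1, 2, 3, 4],
    'label': [1, 1, 1, 0, 0],   # three positives, two negatives
    'text': ['a', 'b', 'c', 'd', 'e'],
})
balanced = step_3(toy, Namespace(seed=42))
print(Counter(balanced['label']))  # expected: two examples per label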
def make_kor_dataset_v1(args):
    """KorQuAD Dataset V1
    1. Filter by document length (512)
    2. At most 4 questions per context
    3. Sample 8000 examples weighted by answer start position
    """
    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path} already exists!")

    kor_dataset = load_dataset("squad_kor_v1")
    kor_dataset = concatenate_datasets([
        kor_dataset["train"].flatten_indices(),
        kor_dataset["validation"].flatten_indices()
    ])

    # (1) Document length: 512 is the minimum length in KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)
    # (2) Remove duplicated contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)
    # (3) Weighted sampling by KOR answer_start, using a 2x multiple
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only KOR_DATASET
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"Saved to {kor_dataset_path}!")
def load_and_concatenate_datasets(data_args):
    """Load and concatenate multiple compatible datasets"""
    train_datasets, validation_datasets = [], []
    for name, config in zip(data_args.dataset_name, data_args.dataset_config_name):
        dataset = load_dataset(name, config)
        if "validation" not in dataset.keys():
            validation_ds = load_dataset(
                name, config,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            train_ds = load_dataset(
                name, config,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
        else:
            validation_ds = dataset["validation"]
            train_ds = dataset["train"]

        # Some specific preprocessing to align fields on known datasets;
        # extraneous fields not used in language modeling are also removed after preprocessing
        if name == "wikipedia":
            train_ds.remove_columns_("title")
            validation_ds.remove_columns_("title")
        elif name == "ptb_text_only":
            train_ds.rename_column_("sentence", "text")
            validation_ds.rename_column_("sentence", "text")

        train_datasets.append(train_ds)
        validation_datasets.append(validation_ds)

    for ds_idx in range(1, len(train_datasets)):
        assert train_datasets[ds_idx].features.type == train_datasets[ds_idx - 1].features.type, \
            "Features name and type must match between all datasets"

    datasets = DatasetDict()
    datasets["train"] = concatenate_datasets(train_datasets)
    datasets["validation"] = concatenate_datasets(validation_datasets)

    return datasets
def from_datasets(cls, dataset_list: List["Dataset"]) -> "Dataset":
    """Create a single Dataset by concatenating a list of datasets

    Parameters
    ----------
    dataset_list
        Datasets to be concatenated. They must have the same column types.

    Returns
    -------
    dataset
    """
    return cls(datasets.concatenate_datasets([ds.dataset for ds in dataset_list]))
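# A hedged usage sketch for from_datasets (not from the original source). `WrappedDataset`
# is a hypothetical stand-in for the wrapper class this classmethod belongs to: it only
# needs to expose the underlying HF dataset as `.dataset` and accept one in its constructor.
from typing import List
import datasets
from datasets import Dataset as HFDataset

class WrappedDataset:
    def __init__(self, dataset: HFDataset):
        self.dataset = dataset

    @classmethod
    def from_datasets(cls, dataset_list: List["WrappedDataset"]) -> "WrappedDataset":
        return cls(datasets.concatenate_datasets([ds.dataset for ds in dataset_list]))

a = WrappedDataset(HFDataset.from_dict({"text": ["x"]}))
b = WrappedDataset(HFDataset.from_dict({"text": ["y"]}))
merged = WrappedDataset.from_datasets([a, b])
print(len(merged.dataset))  # 2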
def test_dataset_concatenate_audio_features(shared_datadir):
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data1 = {"audio": [audio_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"audio": Audio()}))
    data2 = {"audio": [{"bytes": open(audio_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0]["audio"]["array"].shape
    assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape
def add_index(shard_dir, index_path):
    data_shard_list = []

    for shard_address in glob(str(shard_dir) + "/*/"):
        data_shard_list.append(load_from_disk(shard_address))

    concat = concatenate_datasets(data_shard_list)
    faiss.omp_set_num_threads(96)
    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    concat.add_faiss_index("embeddings", custom_index=index)
    concat.get_index("embeddings").save(index_path)
    # since we load the index into memory, we can directly update the index on the disk
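# A hedged follow-up sketch (not from the original source): the FAISS index file written by
# add_index can later be attached to a dataset that carries the same "embeddings" column.
# The paths and the 768-dim query vector here are placeholders.
import numpy as np
from datasets import load_from_disk

passages = load_from_disk("shards/merged_passages")            # hypothetical dataset path
passages.load_faiss_index("embeddings", "shards/hnsw.faiss")   # hypothetical index_path
query = np.random.rand(768).astype("float32")
scores, retrieved = passages.get_nearest_examples("embeddings", query, k=5)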
def extend_length(self, length):
    """Extends the dataset to `length` rows by randomly repeating existing rows."""
    len_ = len(self)
    if length < len_:
        raise ValueError('Should not decrease the length of the dataset')

    rand_indices = np.random.randint(len_, size=length - len_)
    columns = self.dataset.format['columns']
    additional_data = self.dataset.select(rand_indices)
    self.dataset = concatenate_datasets([self.dataset, additional_data])
    self.dataset.set_format(type=self.dataset.format["type"], columns=columns)
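# A minimal sketch of the extend_length idea on a bare HF dataset (not from the original
# source): oversample random rows and append them until the dataset reaches `target_len`.
import numpy as np
from datasets import Dataset, concatenate_datasets

ds = Dataset.from_dict({"text": ["a", "b", "c"]})
target_len = 5
extra = ds.select(np.random.randint(len(ds), size=target_len - len(ds)))
ds = concatenate_datasets([ds, extra])
print(len(ds))  # 5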
def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
    ds = training_ds

    # concatenate the validation dataset if it is included
    if validation_ds is not None:
        # add an "is_valid_col" column to both training/validation DataFrames to indicate
        # what data is part of the validation set
        if self.is_valid_attr:
            validation_ds = validation_ds.add_column(self.is_valid_attr, [True] * len(validation_ds))
            training_ds = training_ds.add_column(self.is_valid_attr, [False] * len(training_ds))

        ds = concatenate_datasets([training_ds, validation_ds])

    return ds
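# A hedged sketch of what process_hf_dataset produces (not from the original source);
# `is_valid_attr` is assumed to be a column name such as "is_valid".
from datasets import Dataset, concatenate_datasets

train = Dataset.from_dict({"text": ["t1", "t2"]})
valid = Dataset.from_dict({"text": ["v1"]})

is_valid_attr = "is_valid"
valid = valid.add_column(is_valid_attr, [True] * len(valid))
train = train.add_column(is_valid_attr, [False] * len(train))
ds = concatenate_datasets([train, valid])
print(ds["is_valid"])  # [False, False, True]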
def create_dataloader(args, dataset, tokenizer, k_shot=False, num_classes=None):
    """
    Function to create a PyTorch DataLoader from a given dataset.

    Inputs:
        args - Namespace object from the argument parser
        dataset - Dataset to convert to a DataLoader
        tokenizer - BERT tokenizer instance
        k_shot - Indicates whether to make the training set k-shot. Default is False
        num_classes - Number of classes in the dataset. Default is None
    Outputs:
        dataset - DataLoader object of the dataset
    """
    # check if k-shot
    new_dataset = []
    if k_shot:
        for current_class in range(0, num_classes):
            class_set = dataset.filter(lambda example: example['labels'] == current_class)
            class_set = class_set.shuffle()
            class_set = class_set.filter(lambda e, i: i < args.k, with_indices=True)
            new_dataset.append(class_set)
        dataset = concatenate_datasets(new_dataset)

    # create a data collator function
    data_collator = DataCollatorWithPadding(tokenizer)

    # create the dataloader
    dataset = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        drop_last=False,
        shuffle=True,
    )

    # return the dataset
    return dataset
def load_qa_dataset(examples_dirpath, scratch_dirpath, clean_model_filepaths=None, more_clean_data=False):
    clean_fns = []
    if more_clean_data:
        for model_type_paths in clean_model_filepaths.values():
            clean_examples_dirpath_list = ['/'.join(v.split('/')[:-1] + ['example_data']) for v in model_type_paths]
            for dirpath in clean_examples_dirpath_list:
                clean_fns += [os.path.join(dirpath, fn) for fn in os.listdir(dirpath)
                              if (fn.endswith('.json') and 'clean' in fn)]

    fns = [os.path.join(examples_dirpath, fn) for fn in os.listdir(examples_dirpath)
           if (fn.endswith('.json') and 'clean' in fn)]
    fns.sort()
    examples_filepath_list = fns + clean_fns

    dataset_list = []
    for examples_filepath in examples_filepath_list:
        # Load the examples
        dataset_list.append(datasets.load_dataset('json', data_files=[examples_filepath], field='data',
                                                  keep_in_memory=True, split='train',
                                                  cache_dir=os.path.join(scratch_dirpath, '.cache')))

    return datasets.concatenate_datasets(dataset_list)
def _exec_logistic_regression(self):
    datasets = load_from_disk(p.join(self.args.path.train_data_dir, "train_dataset"))
    train_dataset = concatenate_datasets([datasets["train"], datasets["validation"]])

    queries = train_dataset["question"]

    doc_scores, doc_indices = self.sparse_retriever.get_relevant_doc_bulk(queries, topk=8)
    doc_scores, doc_indices = np.array(doc_scores), np.array(doc_indices)
    contexts = np.array(self.sparse_retriever.contexts)

    train_x, train_y = [], []

    for idx in tqdm.tqdm(range(len(doc_scores))):
        doc_index = doc_indices[idx]
        org_context = train_dataset["context"][idx]

        feature_vector = [doc_scores[idx][:pow(2, i)] for i in range(1, self.num_features + 1)]
        feature_vector = list(map(lambda x: x.mean(), feature_vector))
        feature_vector = softmax(feature_vector)

        label = 0
        y = -1

        if org_context in contexts[doc_index]:
            y = list(contexts[doc_index]).index(org_context)

        if y != -1 and y < self.kbound:
            label = 1

        train_x.append(feature_vector)
        train_y.append(label)

    logistic = LogisticRegression()
    logistic.fit(train_x, train_y)

    return logistic
def main(args):
    mrc_test_dataset = load_from_disk(p.join(args.path.train_data_dir, "test_dataset"))
    mrc_dummy_dataset = load_from_disk(p.join(args.path.train_data_dir, "dummy_dataset"))

    all_mrc_dummy_dataset = concatenate_datasets(
        [mrc_dummy_dataset["train"].flatten_indices(), mrc_dummy_dataset["validation"].flatten_indices()]
    )

    cheat_ids = list(set(mrc_test_dataset["validation"]["id"]).intersection(set(all_mrc_dummy_dataset["id"])))

    cheats = {}

    for cheat_id in cheat_ids:  # e.g. cheat_id: 'mrc-1-000711'
        temp = check_is_real_cheating(mrc_test_dataset, all_mrc_dummy_dataset, cheat_id)
        cheats[cheat_id] = temp

    cheat_path = p.join(args.path.train_data_dir, "cheat.json")

    print(cheats)

    with open(cheat_path, "w") as f:
        f.write(json.dumps(cheats, indent=4, ensure_ascii=False) + "\n")
def createDataset(config):
    """Build the dataset from the h5 file and filter out rare *individual ATU* types."""
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]
    dataset = Dataset.from_pandas(atu)

    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = dataset.shuffle(seed=config.seed).map(tokenize, batched=True)

    # split by class (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset.filter(lambda d: d['label'] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio, seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict({split: concatenate_datasets(ds) for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)

    return dataset
                               cols=TEXT_COLS[task], max_len=c.max_length)
    glue_dsets[task] = dsets.my_map(
        tok_func, cache_file_names=f"tokenized_{c.max_length}_{{split}}")

    if c.double_unordered and task in ["mrpc", "stsb"]:
        swap_tok_func = partial(
            tokenize_sents_max_len,
            cols=TEXT_COLS[task],
            max_len=c.max_length,
            swap=True,
        )
        swapped_train = dsets["train"].my_map(
            swap_tok_func,
            cache_file_name=f"swapped_tokenized_{c.max_length}_train")
        glue_dsets[task]["train"] = datasets.concatenate_datasets(
            [glue_dsets[task]["train"], swapped_train])

    # Load / Make dataloaders
    hf_dsets = HF_Datasets(
        glue_dsets[task],
        hf_toker=hf_tokenizer,
        n_inp=3,
        cols={
            "inp_ids": TensorText,
            "attn_mask": noop,
            "token_type_ids": noop,
            "label": TensorCategory,
        },
    )
    if c.double_unordered and task in ["mrpc", "stsb"]:
        dl_kwargs = {
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)

wiki = load_dataset("wikipedia", "20200501.en", split="train")  # len(wiki): 6078422
bookcorpus = load_dataset("bookcorpus", split="train")  # len(bookcorpus): 74004228
print(wiki.column_names, bookcorpus.column_names)  # ['title', 'text'] ['text']

wiki.remove_columns_("title")
bart_dataset = concatenate_datasets([wiki, bookcorpus])  # len(bart_dataset): 80082650

tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large', use_fast=True)

# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = bart_dataset.column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

tokenized_datasets = bart_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=16,
timit = load_dataset('timit_asr')

def is_discardable(batch):
    # returns False when the utterance contains a phoneme from TIMIT_DISCARD,
    # so filtering on this predicate keeps only examples without such phonemes
    for phoneme in batch["phonetic_detail"]["utterance"]:
        if phoneme in TIMIT_DISCARD:
            return False
    return True

timit_filt = timit["train"].filter(lambda eg: is_discardable(eg))
timit_filt2 = timit["test"].filter(lambda eg: is_discardable(eg))
timit = concatenate_datasets([timit_filt, timit_filt2])

MAX_TOKENS = 1120000
BASE = timit[0]["file"].split("/data/")[0] + "/data/"

with open("train_timit.tsv", "w") as manifest, open("train_timit.ltr", "w") as transcript:
    manifest.write(BASE + "\n")
    for item in timit:
        frames, sr = sf.read(item["file"])
        manifest.write(f"{item['file'].replace(BASE, '')}\t{len(frames)}\n")
        utt = item['phonetic_detail']['utterance']
        mapped = map_timit_to_cmudict(utt)
        transcript.write(f"{' '.join(mapped)}\n")
parser.add_argument("output_dir") parser.add_argument("--lr", default=3e-5, type=float) parser.add_argument("--epochs", default=3, type=int) parser.add_argument("--batch_size", default=128, type=int) parser.add_argument("--fp16", action="store_true") args = parser.parse_args() # Load dataset. TODO: Do this in prepare_smnli.py instead dataset_list = [ load_from_disk("data/mnli-tokenized"), load_from_disk("data/snli-tokenized"), ] train_dataset = concatenate_datasets([ Dataset.from_dict({ "attention_mask": dataset["train"]["attention_mask"], "input_ids": dataset["train"]["input_ids"], "token_type_ids": dataset["train"]["token_type_ids"], "label": dataset["train"]["label"], }) for dataset in dataset_list ]) def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Prepare and train model
def get_train_dataset(args):
    args.data.dataset_name = "train_dataset"
    datasets = get_dataset(args, is_train=True)
    datasets = concatenate_datasets([datasets["train"], datasets["validation"]])
    return datasets
        doc_scores = []
        doc_indices = []
        for query in queries:
            ret0, ret1 = self.get_relevant_doc(query, k)
            doc_scores.append(ret0)
            doc_indices.append(ret1)
        return doc_scores, doc_indices


if __name__ == "__main__":
    # Test sparse
    org_dataset = load_from_disk("data/train_dataset")
    full_ds = concatenate_datasets([
        org_dataset["train"].flatten_indices(),
        org_dataset["validation"].flatten_indices(),
    ])  # test against all 4192 questions from the combined train and dev sets
    print("*" * 40, "query dataset", "*" * 40)
    print(full_ds)

    mecab = Mecab()

    def tokenize(text):
        # return text.split(" ")
        return mecab.morphs(text)

    wiki_path = "wikipedia_documents.json"
    retriever = BM25Arti(
        # tokenize_fn=tokenizer.tokenize,
        tokenize_fn=tokenize,
        data_path="data",
if not os.path.exists(train_save_path):
    print("generating new, class-balanced datasets...")
    train_datasets = []
    valid_datasets = []
    for c in range(num_classes):
        class_dataset = dataset.filter(lambda example: example['label'] == c).shuffle()
        train_datasets.append(class_dataset.select(range(num_train_per_class)))
        if not os.path.exists(valid_save_path):
            valid_datasets.append(
                class_dataset.select(range(num_train_per_class, num_valid_per_class + num_train_per_class)))
    train_dataset = concatenate_datasets(train_datasets).shuffle()
    train_dataset.save_to_disk(train_save_path)
    if not os.path.exists(valid_save_path):
        valid_dataset = concatenate_datasets(valid_datasets).shuffle()
        valid_dataset.save_to_disk(valid_save_path)
else:
    print("loading {}...".format(train_save_path))
    # abridged, class-balanced dataset already exists, so just load it
    train_dataset = load_from_disk(train_save_path)

for t in ts:
    tran_train_save_path = os.path.join(data_dir, task, t, task + '_train_' + str(num_train_per_class))
dataset_448.save_to_disk("/home/ahemf/processed/c4_448")

c4 = DatasetDict.load_from_disk("/home/ahemf/processed/c4_448")
dsets = Dataset.load_from_disk("/home/ahemf/processed/dsets_448")

c4['train'] = c4['train'].add_column('dataset', ['c4'] * len(c4['train']))
c4['train'] = c4['train'].remove_columns(['url', 'timestamp'])
c4['validation'] = c4['validation'].remove_columns(['url', 'timestamp'])
c4['validation'] = c4['validation'].add_column('dataset', ['c4'] * len(c4['validation']))

dataset_col = dsets['dataset']
dsets = dsets.remove_columns(["dataset"])
dsets = dsets.add_column("dataset", dataset_col)

c4["train"] = concatenate_datasets([c4["train"], dsets])
c4["train"].save_to_disk("/home/ahemf/processed/c4_extended")
c4 = Dataset.load_from_disk("/home/ahemf/processed/c4_extended")

###################################################################
## TF-IDF
###################################################################

from collections import Counter
from transformers import AutoTokenizer, AutoModel, RobertaTokenizerFast
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
import os
import numpy as np
import re
from multiprocess.pool import Pool
def __get_all_train(self):
    all_data = concatenate_datasets([self.snli['train'], self.mnli['train']])
    return self.__get_data_loader(all_data)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (BasicModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) checkpoint_dir = hyperparam_path_for_baseline(model_args, data_args, training_args) ckpt_dir = Path(checkpoint_dir) postfix = "" if training_args.do_train: postfix += "_train" if training_args.do_eval: postfix += "_eval" setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix) training_args.output_dir = checkpoint_dir # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). if not 0 <= data_args.holdout_set < data_args.n_fold: raise ValueError("Test fold must be in [0, n_fold)") if data_args.dataset not in ['race', 'dream']: raise ValueError("Dataset should be race or dream.") else: from mcmrc.data_utils.processors import prepare_features # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
if data_args.debug_mode: datasets = load_dataset(data_args.dataload_script, data_args.dataload_split, data_dir=data_args.data_dir, split={ 'train': ReadInstruction('train', from_=0, to=5, unit='abs'), 'validation': ReadInstruction('validation', from_=0, to=5, unit='abs'), 'test': ReadInstruction('test', from_=0, to=5, unit='abs') }) else: datasets = load_dataset(data_args.dataload_script, data_args.dataload_split, data_dir=data_args.data_dir) if data_args.shuffle_train_dataset: datasets['train'] = datasets['train'].shuffle(seed=training_args.seed) if data_args.split_train_dataset: holdout_set_start = int( len(datasets['train']) / data_args.n_fold * data_args.holdout_set) holdout_set_end = int( len(datasets['train']) / data_args.n_fold * (data_args.holdout_set + 1)) shuffled_train_set = datasets['train'].shuffle(seed=training_args.seed) if holdout_set_start == 0: new_train_set = Dataset.from_dict( shuffled_train_set[holdout_set_end:]) elif holdout_set_end == len(datasets['train']): new_train_set = Dataset.from_dict( shuffled_train_set[:holdout_set_start]) else: new_train_set = concatenate_datasets([ Dataset.from_dict(shuffled_train_set[:holdout_set_start]), Dataset.from_dict(shuffled_train_set[holdout_set_end:]) ]) new_holdout_set = Dataset.from_dict( shuffled_train_set[holdout_set_start:holdout_set_end]) assert new_train_set.num_rows + new_holdout_set.num_rows == shuffled_train_set.num_rows datasets['train'] = new_train_set datasets['holdout_set'] = new_holdout_set # Load pretrained model and tokenizer # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, ) model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names pprepare_features = partial(prepare_features, tokenizer=tokenizer, data_args=data_args) tokenized_datasets = datasets.map( pprepare_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorForMultipleChoice(tokenizer=tokenizer)) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_mc_metrics, ) # Training if training_args.do_train: train_result = trainer.train() output_train_file = os.path.join(training_args.output_dir, "train_results.txt") with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(train_result.metrics.items()): logger.info(f"{key} = {value:.3f}") writer.write(f"{key} = {value:.3f}\n") # Evaluation # To use the best checkpoint model at end, use the aruguments # 
load_best_model_at_end, metric_for_best_model, evaluation_strategy steps # --load_best_model_at_end \ # --metric_for_best_model accuracy \ # --evaluation_strategy steps \ if training_args.do_eval: if training_args.load_best_model_at_end: best_model = AutoModelForMultipleChoice.from_pretrained( training_args.output_dir, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) best_model = best_model.to(training_args.device) for split in [k for k in datasets.keys() if k != "train"]: logger.info(f"*** Evaluate {split} set ***") results = trainer.evaluate(tokenized_datasets[split]) if training_args.load_best_model_at_end: final_model = trainer.model trainer.model = best_model best_model_results = trainer.evaluate( tokenized_datasets[split]) trainer.model = final_model output_eval_file = os.path.join(training_args.output_dir, f"{split}_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Extensive Eval results *****") if not training_args.do_train: writer.write( f"eval checkpoint {model_args.model_name_or_path}\n") for key, value in sorted(results.metrics.items()): logger.info(f"{key} = {value:.3f}") writer.write(f"{key} = {value:.3f}\n") if training_args.load_best_model_at_end: writer.write(f"best model on dev set\n") for key, value in sorted( best_model_results.metrics.items()): logger.info(f"{key} = {value:.3f}") writer.write(f"{key} = {value:.3f}\n") if data_args.output_prediction_file or data_args.split_train_dataset: prediction = { example_id: prediction.tolist() for prediction, label_id, example_id in zip(*results[:-1]) } if split == "holdout_set": output_prediction_file = os.path.join( training_args.output_dir, f"holdout_{data_args.n_fold}_{data_args.holdout_set}_prediction.json" ) else: output_prediction_file = os.path.join( training_args.output_dir, f"{split}_prediction.json") with open(output_prediction_file, "w") as f: json.dump(prediction, f)
def training_step(self, batch, batch_idx) -> Dict: global isEmUpdateBusy # use to check whether the entire embedding update process is finished or not global isAddIndexBusy # use to check whether the entire indexing process is finished or not global processes # use to keep threads embedding update processes global threadHandle_index # use to keep thread in embedding indexing processes if (self.trainer.global_rank == 0) and (self.custom_config.end2end): if (not batch_idx == 0) and ( batch_idx % self.custom_config.indexing_freq == 0): free_gpu_list = [] nvmlInit() deviceCount = nvmlDeviceGetCount() my_list = json.loads(self.custom_config.gpu_order) for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) info = nvmlDeviceGetMemoryInfo(handle) if info.used / 1e6 < 15: position = my_list.index(i) free_gpu_list.append("cuda:" + str(position)) if len(free_gpu_list) >= self.custom_config.index_gpus: has_free_gpus = True else: has_free_gpus = False if (not isEmUpdateBusy) and has_free_gpus: model_copy = type(self.model.rag.ctx_encoder)( self.config_dpr ) # get a new instance #this will be load in the CPU model_copy.load_state_dict(self.model.rag.ctx_encoder. state_dict()) # copy weights processes = [] if len(free_gpu_list) > self.custom_config.index_gpus: cuda_devices = random.sample( free_gpu_list, self.custom_config.index_gpus) else: cuda_devices = free_gpu_list num_processes = len(cuda_devices) for rank in range(num_processes): logger.info( "Iniitializing embedding calculation process rank{}" .format(rank)) device = cuda_devices[rank] p = multiprocessing.Process( target=embed_update, args=( copy.deepcopy(model_copy), num_processes, device, rank, self.custom_config.shard_dir, self.custom_config.csv_path, ), ) processes.append(p) for p in processes: p.start() isEmUpdateBusy = True if isEmUpdateBusy and (not isAddIndexBusy): index_process_list = [ processes[k].is_alive() for k in range(self.custom_config.index_gpus) ] if ( sum(index_process_list) == 0 ): # If entire list is false, we can say all embedding calculation process has finished logger.info("Start adding the index") threadHandle_index = multiprocessing.Process( target=add_index, args=( self.custom_config.shard_dir, self.config.index_path, ), ) threadHandle_index.start() isAddIndexBusy = True # check when index building has started if isAddIndexBusy: # check still the index_building process is happening if not threadHandle_index.is_alive(): logger.info("Merging the dataset shards") saved_dataset_shards = [] for address in glob( str(self.custom_config.shard_dir) + "/*/"): saved_dataset_shards.append(load_from_disk(address)) concat = concatenate_datasets(saved_dataset_shards) concat.save_to_disk( self.config.passages_path ) # here we update the main passage file on the disk logger.info("done updating the dataset") # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker. # logger.info("then updating the index") # shutil.copy(self.custom_config.temp_index, self.config.idex_path) logger.info( "Loading new passages and iniitalzing new index") self.trainer.model.module.module.model.rag.retriever.re_load( ) self.trainer.model.module.module.model.rag.retriever.init_retrieval( ) isEmUpdateBusy = False isAddIndexBusy = False self.trainer.accelerator_connector.accelerator.barrier( "barrier") # waint untill the index and kb get re-initialized. 
        loss_tensors = self._step(batch)

        logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}

        # tokens per batch
        tgt_pad_token_id = (self.tokenizer.generator.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer)
                            else self.tokenizer.pad_token_id)
        src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer)
                            else self.tokenizer.pad_token_id)
        logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() +
                       batch["decoder_input_ids"].ne(tgt_pad_token_id).sum())

        self.log("loss", loss_tensors[0])
        return loss_tensors[0]
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") if not os.path.exists(training_args.output_dir): os.makedirs(training_args.output_dir) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset("glue", data_args.task_name) elif data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) else: if not data_args.joint_training: # Loading a dataset from local json files datasets = load_dataset("json", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) else: # joint training datasets = {} for lang in MARC_LANGS: datasets[lang] = load_dataset( "json", data_files={ "train": data_args.train_file.format(lang), "validation": data_args.validation_file.format(lang) }) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. # if not data_args.joint_training: tmp = datasets["en"] if data_args.joint_training else datasets is_regression = tmp["train"].features[ data_args.label_column_name].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = tmp["train"].unique(data_args.label_column_name) label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. tmp = datasets["en"] if data_args.joint_training else datasets non_label_column_names = [ name for name in tmp["train"].column_names if name != data_args.label_column_name ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: #### completely modify this condition to use MARC sentence1_key, sentence2_key = "review_body", None # if len(non_label_column_names) >= 2: # sentence1_key, sentence2_key = non_label_column_names[:2] # else: # sentence1_key, sentence2_key = non_label_column#### completely modify this condition to use MARC _names[0], None #### completely modify this condition to use MARC # Padding strategy if data_args.pad_to_max_length: padding = "max_length" max_length = data_args.max_seq_length else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False max_length = None # idx2token = {v: k for k, v in tokenizer.vocab.items()} tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, ) # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = {v: i for i, v in enumerate(label_list)} # if ( # model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id # and data_args.task_name is not None # and is_regression # ): # # Some have all caps in their config, some don't. 
# label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} # if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): # label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} # else: # logger.warning( # "Your model seems to have been trained with labels, but they don't match the dataset: ", # f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." # "\nIgnoring the model labels as a result.", # ) # elif data_args.task_name is None: # label_to_id = {v: i for i, v in enumerate(label_list)} def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) ### change tokenizer language code ### lang = examples["language"][0] lang_code = LANG2LANG_CODE[lang] tokenizer.set_src_lang_special_tokens(lang_code) ### change tokenizer language code ### result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and data_args.label_column_name in examples: result["label"] = [ label_to_id[l] for l in examples[data_args.label_column_name] ] return result if not data_args.joint_training: datasets = datasets.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) else: for lang in MARC_LANGS: datasets[lang] = datasets[lang].map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) merged_train_datasets = datasets_module.concatenate_datasets( [dataset["train"] for dataset in datasets.values()]) merged_dev_datasets = datasets_module.concatenate_datasets( [dataset["validation"] for dataset in datasets.values()]) datasets = DatasetDict(train=merged_train_datasets, validation=merged_dev_datasets) train_dataset = datasets["train"] eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.task_name is not None: test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. 
def compute_metrics(p: EvalPrediction): # TODO: modify accordingly preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list( result.values())).item() return result elif is_regression: ### modify from MSE to MAE for MARC ### return {"mae": (np.abs(preds - p.label_ids)).mean().item()} ### modify from MSE to MAE for MARC ### else: return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item(), "mae": (np.abs(preds - p.label_ids)).mean().item() } NUM_LAYERS = data_args.num_layer LAYERS = list(range(2 * NUM_LAYERS)) NUM_HEADS = data_args.num_head for layer in LAYERS: heads = range( 2 * NUM_HEADS) if layer > NUM_LAYERS - 1 else range(NUM_HEADS) for head in heads: config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) model = MBartForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. data_collator=default_data_collator if data_args.pad_to_max_length else None, ) if data_args.do_prune: logger.info("*** Purturbing mBART ***") tasks = [data_args.task_name] eval_datasets = [eval_dataset] from ipdb import set_trace as bp # bp() model.prune_heads({layer: [head]}) # Training if training_args.do_train: train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload output_train_file = os.path.join(training_args.output_dir, "train_results.txt") if trainer.is_world_process_zero(): with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) lang_head_specific_folder = os.path.join( training_args.output_dir, data_args.language, f"layer{layer}_head{head}") if not os.path.exists(lang_head_specific_folder): os.makedirs(lang_head_specific_folder) for eval_dataset, task in zip(eval_datasets, tasks): eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( lang_head_specific_folder, f"eval_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info(f"***** Eval results {task} *****") for key, value in sorted(eval_result.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") output_eval_file = os.path.join( lang_head_specific_folder, f"eval_results_{task}.json") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: import json json.dump(eval_result, writer) eval_results.update(eval_result) if training_args.do_predict: logger.info("*** Test ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] test_datasets = [test_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") test_datasets.append(datasets["test_mismatched"]) for test_dataset, task in zip(test_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. test_dataset.remove_columns_("label") predictions = trainer.predict( test_dataset=test_dataset).predictions predictions = np.squeeze( predictions) if is_regression else np.argmax( predictions, axis=1) output_test_file = os.path.join( training_args.output_dir, f"test_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_test_file, "w") as writer: logger.info(f"***** Test results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n")
    examples['n_real'] = [
        sum([0 if cls.tokenizer.convert_ids_to_tokens(i).startswith('##') else 1 for i in line]) - 2
        for line in examples['input_ids']
    ]
    return examples


if __name__ == '__main__':
    from utils import get_tokenizer
    from copy import deepcopy

    t = get_tokenizer('bert-base-chinese', is_zh=True)
    ds = get_tokenized_ds('hfds_scripts/atec_dataset.py',
                          '../sentence-embedding/data/ATEC/atec_nlp_sim_train.csv',
                          t, tokenize_type='with_prefix')
    ds = ds['atec']
    ds2 = deepcopy(ds)

    for index, ds_ in enumerate([ds, ds2]):
        features = list(ds_.features)
        for feature in features:
            if index:
                if feature.startswith('textb') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])
            else:
                if feature.startswith('texta') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])

    ds = concatenate_datasets([ds, ds2])
    print(ds)