Example #1
def create_pseudo_labeled_data(args, infer_input, infer_output, eval_result,
                               id2label, next_data_dir):
    """Create pseudeo labeled data for the next self-training iteration."""

    dataset = datasets.concatenate_datasets([infer_input, infer_output],
                                            axis=1)

    if args.do_filter_by_confidence:
        dataset = dataset.filter(
            lambda example: example['probability'] > args.confidence_threshold)

    if args.do_filter_by_val_performance:
        assert 0.0 <= eval_result <= 1.0
        num_selected_rows = int(eval_result * len(dataset))
        print(num_selected_rows)
        dataset = dataset.sort('probability', reverse=True)
        dataset = dataset.select(range(num_selected_rows))

    dataset = dataset.remove_columns(['label', 'probability'])
    dataset = dataset.rename_column('prediction', 'label')
    dataset = dataset.map(
        lambda example: {'label': id2label[example['label']]})
    dataset = dataset.shuffle(seed=args.seed)

    pseudo_labeled_data_file = os.path.join(
        next_data_dir, f'train_pseudo.{args.data_file_extension}')
    if args.data_file_extension == 'csv':
        dataset.to_csv(pseudo_labeled_data_file, index=False)
    else:
        dataset.to_json(pseudo_labeled_data_file)
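A minimal, self-contained sketch of the confidence-filtering path above, on a toy in-memory dataset; the column names follow the snippet, while the data values and the 0.9 threshold are illustrative assumptions.

# Hedged sketch: confidence-based pseudo-label filtering on a toy dataset.
from datasets import Dataset

infer_output = Dataset.from_dict({
    "prediction": [0, 1, 1, 0],
    "probability": [0.45, 0.92, 0.88, 0.97],
    "label": [-1, -1, -1, -1],  # placeholder labels from the unlabeled pool
})

confidence_threshold = 0.9  # assumed hyperparameter
pseudo = infer_output.filter(lambda ex: ex["probability"] > confidence_threshold)
pseudo = pseudo.remove_columns(["label", "probability"])
pseudo = pseudo.rename_column("prediction", "label")
print(pseudo["label"])  # [1, 0]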
Example #2
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(
            args.data.sub_datasets.split(","),
            args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)

        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # sub dataset must have same features: ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features

        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(),
                                              features=features)

        concatenate_list.append(new_sub_dataset.flatten_indices())

    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(),
                                        features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] +
                                         concatenate_list)

    return train_dataset
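A hedged sketch of the ratio-based subsetting and concatenation above, on toy datasets; the dataset contents and the 0.5 ratio are illustrative assumptions.

# Hedged sketch: take a fraction of an extra dataset and append it to a base one.
from datasets import Dataset, concatenate_datasets

base = Dataset.from_dict({"id": [1, 2], "question": ["q1", "q2"]})
extra = Dataset.from_dict({"id": [3, 4, 5, 6], "question": ["q3", "q4", "q5", "q6"]})

ratio = 0.5
subset = extra.select(range(int(len(extra) * ratio)))  # first 50% of the extra set

# flatten_indices() materializes the selection so the concatenated dataset
# does not carry an indices mapping.
merged = concatenate_datasets([base.flatten_indices(), subset.flatten_indices()])
print(len(merged))  # 4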
Example #3
def concate(dataset_name, data, cache_dir):
    if dataset_name in dataset_types:
        all_datasets_downloaded = [
            load_dataset(dataset_name, sub_dataset, cache_dir=cache_dir)
            for sub_dataset in dataset_types[dataset_name]
        ]
        combined_datasets = [
            concatenate_datasets(list(sub_dataset.values()))
            for sub_dataset in all_datasets_downloaded
        ]
        data = concatenate_datasets(combined_datasets)
        return DatasetDict({"train": data})
    data = concatenate_datasets(
        list(load_dataset(dataset_name, cache_dir=cache_dir).values())
    )
    return DatasetDict({"train": data})
def download_dset(c, hf_tokenizer, cache_dir, num_proc):
    dsets = []
    ELECTRAProcessor = partial(ELECTRADataProcessor,
                               hf_tokenizer=hf_tokenizer,
                               max_length=c.max_length)
    # Wikipedia
    if 'wikipedia' in c.datas:
        print('load/download wiki dataset')
        wiki = datasets.load_dataset('wikipedia',
                                     '20200501.en',
                                     cache_dir=cache_dir)['train']
        print('load/create data from wiki dataset for ELECTRA')
        e_wiki = ELECTRAProcessor(wiki).map(
            cache_file_name=f"1000_electra_wiki_{c.max_length}.arrow",
            num_proc=num_proc)
        dsets.append(e_wiki)

    # OpenWebText
    if 'openwebtext' in c.datas:
        print('load/download OpenWebText Corpus')
        owt = datasets.load_dataset('openwebtext',
                                    cache_dir=cache_dir)['train']
        print('load/create data from OpenWebText Corpus for ELECTRA')
        e_owt = ELECTRAProcessor(owt, apply_cleaning=False).map(
            cache_file_name=f"electra_owt_{c.max_length}.arrow",
            num_proc=num_proc)
        dsets.append(e_owt)

    assert len(dsets) == len(c.datas)

    train_dset = datasets.concatenate_datasets(dsets)
    return train_dset
def step_3(dataset, args):
    """
    Balance positive and negative samples by randomly removing examples from the larger-sized class
    """
    print('In Step 3')
    unq_ele_and_cts = Counter(dataset['label'])
    assert len(unq_ele_and_cts) == 2, 'There should only be two unique labels'

    key_with_max_val = max(unq_ele_and_cts, key=unq_ele_and_cts.get)
    key_with_min_val = min(unq_ele_and_cts, key=unq_ele_and_cts.get)

    extra_count = abs(unq_ele_and_cts[key_with_max_val] -
                      unq_ele_and_cts[key_with_min_val])

    ## Divide dataset into two datasets based on their labels
    def filter_based_on_label(sample, idx, label):
        return label == sample['label']

    key_with_data = {}
    for key in unq_ele_and_cts.keys():
        key_with_data[key] = dataset.filter(filter_based_on_label,
                                            fn_kwargs={'label': key},
                                            with_indices=True)

    ## Remove extra counts from class with more elements
    def filter_extra_samples_from_key_with_max_val(sample, idx, extra_count):
        return idx >= extra_count

    key_with_data[key_with_max_val] = key_with_data[key_with_max_val].shuffle(
        seed=args.seed).filter(filter_extra_samples_from_key_with_max_val,
                               fn_kwargs={'extra_count': extra_count},
                               with_indices=True)
    ## Combine the two datasets
    return concatenate_datasets(list(key_with_data.values())).sort('idx')
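A hedged sketch of the same balancing idea using select() for the downsampling step; the toy data and the seed are illustrative, and the "label"/"idx" column names follow the snippet above.

# Hedged sketch: downsample the majority class so both classes have equal counts.
from collections import Counter
from datasets import Dataset, concatenate_datasets

ds = Dataset.from_dict({"idx": list(range(5)), "label": [0, 0, 0, 1, 1]})
min_count = min(Counter(ds["label"]).values())

balanced_parts = []
for label in set(ds["label"]):
    part = ds.filter(lambda ex: ex["label"] == label).shuffle(seed=42)
    balanced_parts.append(part.select(range(min_count)))

balanced = concatenate_datasets(balanced_parts).sort("idx")
print(Counter(balanced["label"]))  # two examples per class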
Example #6
def make_kor_dataset_v1(args):
    """KorQuad Dataset V1
    1. 문서 길이 512이하 Filtering
    2. Context당 Question 최대 4개
    3. ans_start 위치로 8000개 샘플링
    """

    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path} already exists!")

    kor_dataset = load_dataset("squad_kor_v1")

    kor_dataset = concatenate_datasets([
        kor_dataset["train"].flatten_indices(),
        kor_dataset["validation"].flatten_indices()
    ])

    # (1) Document length: 512 is the minimum length for KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)

    # (2) Remove duplicate contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)

    # (3) Weighted sampling by KOR answer_start (using a 2x multiple)
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only the KOR_DATASET
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"{kor_dataset_path}에 저장되었습니다!")
def load_and_concatenate_datasets(data_args):
    """Load and concatenate multiple compatible datasets"""
    train_datasets, validation_datasets = [], []
    for name, config in zip(data_args.dataset_name,
                            data_args.dataset_config_name):

        dataset = load_dataset(name, config)
        if "validation" not in dataset.keys():
            validation_ds = load_dataset(
                name,
                config,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            train_ds = load_dataset(
                name,
                config,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
        else:
            validation_ds = dataset["validation"]
            train_ds = dataset["train"]

        # Some specific preprocessing to align fields on known datasets
        # extraneous fields not used in language modeling are also removed
        # after preprocessing
        if name == "wikipedia":
            train_ds.remove_columns_("title")
            validation_ds.remove_columns_("title")
        elif name == "ptb_text_only":
            train_ds.rename_column_("sentence", "text")
            validation_ds.rename_column_("sentence", "text")

        train_datasets.append(train_ds)
        validation_datasets.append(validation_ds)

    for ds_idx in range(1, len(train_datasets)):
        assert train_datasets[ds_idx].features.type == \
            train_datasets[ds_idx - 1].features.type, \
            "Features name and type must match between all datasets"

    datasets = DatasetDict()
    datasets["train"] = concatenate_datasets(train_datasets)
    datasets["validation"] = concatenate_datasets(validation_datasets)

    return datasets
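The assertion above requires identical features across datasets; here is a hedged sketch of aligning schemas with Dataset.cast before concatenation (the toy data and the int32 column are illustrative assumptions).

# Hedged sketch: cast one dataset's features to match another's before concatenating.
from datasets import Dataset, Features, Value, concatenate_datasets

a = Dataset.from_dict({"text": ["x"], "label": [0]},
                      features=Features({"text": Value("string"), "label": Value("int64")}))
b = Dataset.from_dict({"text": ["y"], "label": [1]},
                      features=Features({"text": Value("string"), "label": Value("int32")}))

b = b.cast(a.features)  # make the label dtype match
merged = concatenate_datasets([a, b])
print(merged.features["label"].dtype)  # int64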
Example #8
    def from_datasets(cls, dataset_list: List["Dataset"]) -> "Dataset":
        """Create a single Dataset by concatenating a list of datasets

        Parameters
        ----------
        dataset_list
            Datasets to be concatenated. They must have the same column types.

        Returns
        -------
        dataset
        """
        return cls(datasets.concatenate_datasets([ds.dataset for ds in dataset_list]))
def test_dataset_concatenate_audio_features(shared_datadir):
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data1 = {"audio": [audio_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"audio": Audio()}))
    data2 = {"audio": [{"bytes": open(audio_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0][
        "audio"]["array"].shape
    assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0][
        "audio"]["array"].shape
def add_index(shard_dir, index_path):
    data_shard_list = []

    for shard_address in glob(str(shard_dir) + "/*/"):
        data_shard_list.append(load_from_disk(shard_address))

    concat = concatenate_datasets(data_shard_list)
    faiss.omp_set_num_threads(96)

    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    concat.add_faiss_index("embeddings", custom_index=index)
    concat.get_index("embeddings").save(
        index_path
    )  # since we load the index into memory, we can directly update the index on disk
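A hedged sketch of adding and querying a FAISS index on a small "embeddings" column (requires faiss to be installed; the 4-dimensional vectors are purely illustrative).

# Hedged sketch: FAISS index over a toy embeddings column.
import numpy as np
from datasets import Dataset

ds = Dataset.from_dict({
    "text": ["a", "b", "c"],
    "embeddings": [[0.1, 0.2, 0.3, 0.4],
                   [0.4, 0.3, 0.2, 0.1],
                   [0.9, 0.8, 0.7, 0.6]],
})
ds.add_faiss_index(column="embeddings")

query = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32)
scores, retrieved = ds.get_nearest_examples("embeddings", query, k=2)
print(retrieved["text"])  # the two closest rows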
Example #11
    def extend_length(self, length):
        """
        Extends the length of the dataset by randomly repeating length amount of rows.
        """

        len_ = len(self)

        if length < len_:
            raise ValueError('Should not decrease the length of the dataset')

        rand_indices = np.random.randint(len_, size=length - len_)
        columns = self.dataset.format['columns']

        additional_data = self.dataset.select(rand_indices)
        self.dataset = concatenate_datasets([self.dataset, additional_data])
        self.dataset.set_format(type=self.dataset.format["type"], columns=columns)
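A hedged sketch of the same row-repetition idea using only the public datasets API; the target length of 5 is an illustrative assumption.

# Hedged sketch: extend a dataset by re-selecting random rows and concatenating.
import numpy as np
from datasets import Dataset, concatenate_datasets

ds = Dataset.from_dict({"x": [1, 2, 3]})
target_len = 5
extra_idx = np.random.randint(len(ds), size=target_len - len(ds))
extended = concatenate_datasets([ds, ds.select(extra_idx).flatten_indices()])
print(len(extended))  # 5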
Example #12
    def process_hf_dataset(self,
                           training_ds: Dataset,
                           validation_ds: Optional[Dataset] = None):
        ds = training_ds

        # concatenate the validation dataset if it is included
        if validation_ds is not None:
            # add an "is_valid_col" column to both training/validation DataFrames to indicate what data is part of
            # the validation set
            if self.is_valid_attr:
                validation_ds = validation_ds.add_column(
                    self.is_valid_attr, [True] * len(validation_ds))
                training_ds = training_ds.add_column(
                    self.is_valid_attr, [False] * len(training_ds))

            ds = concatenate_datasets([training_ds, validation_ds])

        return ds
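A hedged sketch of the validation-flag pattern above with toy splits; the "is_valid" column name is an illustrative assumption.

# Hedged sketch: tag rows with an is_valid flag, then merge the splits.
from datasets import Dataset, concatenate_datasets

train_ds = Dataset.from_dict({"text": ["t1", "t2"]})
valid_ds = Dataset.from_dict({"text": ["v1"]})

train_ds = train_ds.add_column("is_valid", [False] * len(train_ds))
valid_ds = valid_ds.add_column("is_valid", [True] * len(valid_ds))

combined = concatenate_datasets([train_ds, valid_ds])
print(combined["is_valid"])  # [False, False, True]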
Example #13
def create_dataloader(args,
                      dataset,
                      tokenizer,
                      k_shot=False,
                      num_classes=None):
    """
    Function to create a PyTorch Dataloader from a given dataset.
    Inputs:
        args - Namespace object from the argument parser
        dataset - Dataset to convert to Dataloader
        tokenizer - BERT tokenizer instance
        k_shot - Indicates whether to make the training set k-shot. Default is False
        num_classes - Number of classes in the dataset. Default is None
    Outputs:
        dataset - DataLoader object of the dataset
    """

    # check if k-shot
    new_dataset = []
    if k_shot:
        for current_class in range(0, num_classes):
            class_set = dataset.filter(
                lambda example: example['labels'] == current_class)
            class_set = class_set.shuffle()
            class_set = class_set.filter(lambda e, i: i < args.k,
                                         with_indices=True)
            new_dataset.append(class_set)
        dataset = concatenate_datasets(new_dataset)

    # create a data collator function
    data_collator = DataCollatorWithPadding(tokenizer)

    # create the dataloader
    dataset = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        drop_last=False,
        shuffle=True,
    )

    # return the dataset
    return dataset
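A hedged sketch of wrapping a tokenized Dataset in a DataLoader with dynamic padding, as the function above does; the checkpoint name, toy texts, and batch size are illustrative assumptions.

# Hedged sketch: Dataset -> DataLoader with DataCollatorWithPadding.
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
ds = Dataset.from_dict({"text": ["short", "a slightly longer sentence"],
                        "labels": [0, 1]})
ds = ds.map(lambda ex: tokenizer(ex["text"], truncation=True), batched=True)
ds = ds.remove_columns(["text"])  # keep only fields the collator can pad/tensorize

loader = DataLoader(ds,
                    batch_size=2,
                    collate_fn=DataCollatorWithPadding(tokenizer),
                    shuffle=True)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # padded to the longest sequence in the batch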
Example #14
def load_qa_dataset(examples_dirpath, scratch_dirpath, clean_model_filepaths=None, more_clean_data=False):
    clean_fns = []
    if more_clean_data:
        for model_type_paths in clean_model_filepaths.values():
            clean_examples_dirpath_list = ['/'.join(v.split('/')[:-1]+['example_data']) for v in model_type_paths]
            for dirpath in clean_examples_dirpath_list:
                clean_fns += [os.path.join(dirpath, fn) for fn in os.listdir(dirpath) if (fn.endswith('.json') and 'clean' in fn)]

    fns = [os.path.join(examples_dirpath, fn) for fn in os.listdir(examples_dirpath) if (fn.endswith('.json') and 'clean' in fn)]
    fns.sort()
    
    examples_filepath_list = fns + clean_fns

    dataset_list = []
    for examples_filepath in examples_filepath_list:
        # Load the examples
        dataset_list.append(datasets.load_dataset('json', data_files=[examples_filepath], field='data', keep_in_memory=True, split='train', cache_dir=os.path.join(scratch_dirpath, '.cache')))

    return datasets.concatenate_datasets(dataset_list)        
    def _exec_logistic_regression(self):
        datasets = load_from_disk(
            p.join(self.args.path.train_data_dir, "train_dataset"))

        train_dataset = concatenate_datasets(
            [datasets["train"], datasets["validation"]])

        queries = train_dataset["question"]
        doc_scores, doc_indices = self.sparse_retriever.get_relevant_doc_bulk(
            queries, topk=8)
        doc_scores, doc_indices = np.array(doc_scores), np.array(doc_indices)

        contexts = np.array(self.sparse_retriever.contexts)

        train_x, train_y = [], []

        for idx in tqdm.tqdm(range(len(doc_scores))):
            doc_index = doc_indices[idx]
            org_context = train_dataset["context"][idx]

            feature_vector = [
                doc_scores[idx][:pow(2, i)]
                for i in range(1, self.num_features + 1)
            ]
            feature_vector = list(map(lambda x: x.mean(), feature_vector))
            feature_vector = softmax(feature_vector)

            label = 0
            y = -1
            if org_context in contexts[doc_index]:
                y = list(contexts[doc_index]).index(org_context)
            if y != -1 and y < self.kbound:
                label = 1

            train_x.append(feature_vector)
            train_y.append(label)

        logistic = LogisticRegression()
        logistic.fit(train_x, train_y)

        return logistic
def main(args):
    mrc_test_dataset = load_from_disk(p.join(args.path.train_data_dir, "test_dataset"))
    mrc_dummy_dataset = load_from_disk(p.join(args.path.train_data_dir, "dummy_dataset"))

    all_mrc_dummy_dataset = concatenate_datasets(
        [mrc_dummy_dataset["train"].flatten_indices(), mrc_dummy_dataset["validation"].flatten_indices()]
    )

    cheat_ids = list(set(mrc_test_dataset["validation"]["id"]).intersection(set(all_mrc_dummy_dataset["id"])))

    cheats = {}

    for cheat_id in cheat_ids:  # ex) cheat_id: 'mrc-1-000711'
        temp = check_is_real_cheating(mrc_test_dataset, all_mrc_dummy_dataset, cheat_id)
        cheats[cheat_id] = temp

    cheat_path = p.join(args.path.train_data_dir, "cheat.json")

    print(cheats)

    with open(cheat_path, "w") as f:
        f.write(json.dumps(cheats, indent=4, ensure_ascii=False) + "\n")
def createDataset(config):
    """
    build dataset from the h5 file
    also filter out rare *individual ATU*
    """
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]

    dataset = Dataset.from_pandas(atu)
    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = dataset. \
        shuffle(seed=config.seed). \
        map(tokenize, batched=True)

    # split by cls (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset. \
            filter(lambda d: d['label'] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio,
                                         seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict(
        {split: concatenate_datasets(ds)
         for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)
    return dataset
Example #18
                       cols=TEXT_COLS[task],
                       max_len=c.max_length)
    glue_dsets[task] = dsets.my_map(
        tok_func, cache_file_names=f"tokenized_{c.max_length}_{{split}}")

    if c.double_unordered and task in ["mrpc", "stsb"]:
        swap_tok_func = partial(
            tokenize_sents_max_len,
            cols=TEXT_COLS[task],
            max_len=c.max_length,
            swap=True,
        )
        swapped_train = dsets["train"].my_map(
            swap_tok_func,
            cache_file_name=f"swapped_tokenized_{c.max_length}_train")
        glue_dsets[task]["train"] = datasets.concatenate_datasets(
            [glue_dsets[task]["train"], swapped_train])

    # Load / Make dataloaders
    hf_dsets = HF_Datasets(
        glue_dsets[task],
        hf_toker=hf_tokenizer,
        n_inp=3,
        cols={
            "inp_ids": TensorText,
            "attn_mask": noop,
            "token_type_ids": noop,
            "label": TensorCategory,
        },
    )
    if c.double_unordered and task in ["mrpc", "stsb"]:
        dl_kwargs = {
from transformers import (
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)


wiki = load_dataset("wikipedia", "20200501.en", split="train") # len(wiki):6078422
bookcorpus = load_dataset("bookcorpus", split="train") # len(bookcorpus): 74004228

print(wiki.column_names, bookcorpus.column_names)
# ['title', 'text'] ['text']
wiki.remove_columns_("title")
bart_dataset = concatenate_datasets([wiki, bookcorpus]) # len(bart_dataset):80082650

tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large', use_fast=True)

# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = bart_dataset.column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

tokenized_datasets = bart_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=16,
Example #20

timit = load_dataset('timit_asr')


def is_keepable(batch):
    # Keep only utterances that contain no phonemes from TIMIT_DISCARD.
    for phoneme in batch["phonetic_detail"]["utterance"]:
        if phoneme in TIMIT_DISCARD:
            return False
    return True


timit_filt = timit["train"].filter(lambda eg: is_keepable(eg))

timit_filt2 = timit["test"].filter(lambda eg: is_keepable(eg))

timit = concatenate_datasets([timit_filt, timit_filt2])

MAX_TOKENS = 1120000

BASE = timit[0]["file"].split("/data/")[0] + "/data/"

with open("train_timit.tsv", "w") as manifest, open("train_timit.ltr",
                                                    "w") as transcript:
    manifest.write(BASE + "\n")
    for item in timit:
        frames, sr = sf.read(item["file"])
        manifest.write(f"{item['file'].replace(BASE, '')}\t{len(frames)}\n")
        utt = item['phonetic_detail']['utterance']
        mapped = map_timit_to_cmudict(utt)
        transcript.write(f"{' '.join(mapped)}\n")
Example #21
parser.add_argument("output_dir")
parser.add_argument("--lr", default=3e-5, type=float)
parser.add_argument("--epochs", default=3, type=int)
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--fp16", action="store_true")
args = parser.parse_args()

# Load dataset. TODO: Do this in prepare_smnli.py instead
dataset_list = [
    load_from_disk("data/mnli-tokenized"),
    load_from_disk("data/snli-tokenized"),
]
train_dataset = concatenate_datasets([
    Dataset.from_dict({
        "attention_mask": dataset["train"]["attention_mask"],
        "input_ids": dataset["train"]["input_ids"],
        "token_type_ids": dataset["train"]["token_type_ids"],
        "label": dataset["train"]["label"],
    }) for dataset in dataset_list
])


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
                                           tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {
        "accuracy": (preds == p.label_ids).astype(np.float32).mean().item()
    }


# Prepare and train model
def get_train_dataset(args):
    args.data.dataset_name = "train_dataset"
    datasets = get_dataset(args, is_train=True)
    datasets = concatenate_datasets(
        [datasets["train"], datasets["validation"]])
    return datasets
Example #23
        doc_scores = []
        doc_indices = []
        for query in queries:
            ret0, ret1 = self.get_relevant_doc(query, k)
            doc_scores.append(ret0)
            doc_indices.append(ret1)

        return doc_scores, doc_indices


if __name__ == "__main__":
    # Test sparse
    org_dataset = load_from_disk("data/train_dataset")
    full_ds = concatenate_datasets([
        org_dataset["train"].flatten_indices(),
        org_dataset["validation"].flatten_indices(),
    ])  # test against all 4,192 questions from the combined train and dev sets
    print("*" * 40, "query dataset", "*" * 40)
    print(full_ds)

    mecab = Mecab()

    def tokenize(text):
        # return text.split(" ")
        return mecab.morphs(text)

    wiki_path = "wikipedia_documents.json"
    retriever = BM25Arti(
        # tokenize_fn=tokenizer.tokenize,
        tokenize_fn=tokenize,
        data_path="data",
Example #24
        if not os.path.exists(train_save_path):
            print("generating new, class-balanced datasets...")
            train_datasets = []
            valid_datasets = []
            for c in range(num_classes):
                class_dataset = dataset.filter(
                    lambda example: example['label'] == c).shuffle()
                train_datasets.append(
                    class_dataset.select(range(num_train_per_class)))
                if not os.path.exists(valid_save_path):
                    valid_datasets.append(
                        class_dataset.select(
                            range(num_train_per_class,
                                  num_valid_per_class + num_train_per_class)))

            train_dataset = concatenate_datasets(train_datasets).shuffle()
            train_dataset.save_to_disk(train_save_path)

            if not os.path.exists(valid_save_path):
                valid_dataset = concatenate_datasets(valid_datasets).shuffle()
                valid_dataset.save_to_disk(valid_save_path)

        else:
            print("loading {}...".format(train_save_path))
            # abridged, class-balanced dataset already exists, so just load it
            train_dataset = load_from_disk(train_save_path)

        for t in ts:

            tran_train_save_path = os.path.join(
                data_dir, task, t, task + '_train_' + str(num_train_per_class))
Example #25
dataset_448.save_to_disk("/home/ahemf/processed/c4_448")

c4 = DatasetDict.load_from_disk("/home/ahemf/processed/c4_448")
dsets = Dataset.load_from_disk("/home/ahemf/processed/dsets_448")

c4['train'] = c4['train'].add_column('dataset', ['c4'] * len(c4['train']))
c4['train'] = c4['train'].remove_columns(['url', 'timestamp'])
c4['validation'] = c4['validation'].remove_columns(['url', 'timestamp'])
c4['validation'] = c4['validation'].add_column('dataset',
                                               ['c4'] * len(c4['validation']))

dataset_col = dsets['dataset']
dsets = dsets.remove_columns(["dataset"])
dsets = dsets.add_column("dataset", dataset_col)

c4["train"] = concatenate_datasets([c4["train"], dsets])
c4["train"].save_to_disk("/home/ahemf/processed/c4_extended")

c4 = Dataset.load_from_disk("/home/ahemf/processed/c4_extended")

###################################################################
## TF-IDF
###################################################################

from collections import Counter
from transformers import AutoTokenizer, AutoModel, RobertaTokenizerFast
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
import os
import numpy as np
import re
from multiprocess.pool import Pool
Example #26
    def __get_all_train(self):
        all_data = concatenate_datasets([self.snli['train'], self.mnli['train']])
        return self.__get_data_loader(all_data)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (BasicModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    checkpoint_dir = hyperparam_path_for_baseline(model_args, data_args,
                                                  training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    if training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir,
                      training_args.local_rank,
                      debug=False,
                      postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if not 0 <= data_args.holdout_set < data_args.n_fold:
        raise ValueError("Test fold must be in [0, n_fold)")

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        from mcmrc.data_utils.processors import prepare_features

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if data_args.debug_mode:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir,
                                split={
                                    'train':
                                    ReadInstruction('train',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'validation':
                                    ReadInstruction('validation',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'test':
                                    ReadInstruction('test',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs')
                                })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    if data_args.shuffle_train_dataset:
        datasets['train'] = datasets['train'].shuffle(seed=training_args.seed)

    if data_args.split_train_dataset:
        holdout_set_start = int(
            len(datasets['train']) / data_args.n_fold * data_args.holdout_set)
        holdout_set_end = int(
            len(datasets['train']) / data_args.n_fold *
            (data_args.holdout_set + 1))
        shuffled_train_set = datasets['train'].shuffle(seed=training_args.seed)
        if holdout_set_start == 0:
            new_train_set = Dataset.from_dict(
                shuffled_train_set[holdout_set_end:])
        elif holdout_set_end == len(datasets['train']):
            new_train_set = Dataset.from_dict(
                shuffled_train_set[:holdout_set_start])
        else:
            new_train_set = concatenate_datasets([
                Dataset.from_dict(shuffled_train_set[:holdout_set_start]),
                Dataset.from_dict(shuffled_train_set[holdout_set_end:])
            ])

        new_holdout_set = Dataset.from_dict(
            shuffled_train_set[holdout_set_start:holdout_set_end])
        assert new_train_set.num_rows + new_holdout_set.num_rows == shuffled_train_set.num_rows
        datasets['train'] = new_train_set
        datasets['holdout_set'] = new_holdout_set

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    pprepare_features = partial(prepare_features,
                                tokenizer=tokenizer,
                                data_args=data_args)
    tokenized_datasets = datasets.map(
        pprepare_features,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(tokenizer=tokenizer))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_mc_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    # Evaluation
    # To use the best checkpoint model at the end, use the arguments
    # load_best_model_at_end, metric_for_best_model, evaluation_strategy steps
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    if training_args.do_eval:

        if training_args.load_best_model_at_end:
            best_model = AutoModelForMultipleChoice.from_pretrained(
                training_args.output_dir,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
            best_model = best_model.to(training_args.device)

        for split in [k for k in datasets.keys() if k != "train"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = trainer.evaluate(tokenized_datasets[split])
            if training_args.load_best_model_at_end:
                final_model = trainer.model
                trainer.model = best_model
                best_model_results = trainer.evaluate(
                    tokenized_datasets[split])
                trainer.model = final_model

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                if not training_args.do_train:
                    writer.write(
                        f"eval checkpoint {model_args.model_name_or_path}\n")
                for key, value in sorted(results.metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
                if training_args.load_best_model_at_end:
                    writer.write(f"best model on dev set\n")
                    for key, value in sorted(
                            best_model_results.metrics.items()):
                        logger.info(f"{key} = {value:.3f}")
                        writer.write(f"{key} = {value:.3f}\n")
            if data_args.output_prediction_file or data_args.split_train_dataset:
                prediction = {
                    example_id: prediction.tolist()
                    for prediction, label_id, example_id in zip(*results[:-1])
                }
                if split == "holdout_set":
                    output_prediction_file = os.path.join(
                        training_args.output_dir,
                        f"holdout_{data_args.n_fold}_{data_args.holdout_set}_prediction.json"
                    )
                else:
                    output_prediction_file = os.path.join(
                        training_args.output_dir, f"{split}_prediction.json")
                with open(output_prediction_file, "w") as f:
                    json.dump(prediction, f)
Example #28
    def training_step(self, batch, batch_idx) -> Dict:

        global isEmUpdateBusy  # used to check whether the entire embedding update process has finished
        global isAddIndexBusy  # used to check whether the entire indexing process has finished
        global processes  # used to keep the embedding update processes
        global threadHandle_index  # used to keep the thread for the embedding indexing process

        if (self.trainer.global_rank == 0) and (self.custom_config.end2end):

            if (not batch_idx == 0) and (
                    batch_idx % self.custom_config.indexing_freq == 0):
                free_gpu_list = []
                nvmlInit()
                deviceCount = nvmlDeviceGetCount()

                my_list = json.loads(self.custom_config.gpu_order)

                for i in range(deviceCount):
                    handle = nvmlDeviceGetHandleByIndex(i)
                    info = nvmlDeviceGetMemoryInfo(handle)

                    if info.used / 1e6 < 15:
                        position = my_list.index(i)
                        free_gpu_list.append("cuda:" + str(position))

                if len(free_gpu_list) >= self.custom_config.index_gpus:
                    has_free_gpus = True

                else:
                    has_free_gpus = False

                if (not isEmUpdateBusy) and has_free_gpus:

                    model_copy = type(self.model.rag.ctx_encoder)(
                        self.config_dpr
                    )  # get a new instance  #this will be load in the CPU
                    model_copy.load_state_dict(self.model.rag.ctx_encoder.
                                               state_dict())  # copy weights

                    processes = []

                    if len(free_gpu_list) > self.custom_config.index_gpus:
                        cuda_devices = random.sample(
                            free_gpu_list, self.custom_config.index_gpus)
                    else:
                        cuda_devices = free_gpu_list

                    num_processes = len(cuda_devices)

                    for rank in range(num_processes):
                        logger.info(
                            "Iniitializing  embedding calculation process rank{}"
                            .format(rank))
                        device = cuda_devices[rank]
                        p = multiprocessing.Process(
                            target=embed_update,
                            args=(
                                copy.deepcopy(model_copy),
                                num_processes,
                                device,
                                rank,
                                self.custom_config.shard_dir,
                                self.custom_config.csv_path,
                            ),
                        )
                        processes.append(p)

                    for p in processes:
                        p.start()

                    isEmUpdateBusy = True

            if isEmUpdateBusy and (not isAddIndexBusy):
                index_process_list = [
                    processes[k].is_alive()
                    for k in range(self.custom_config.index_gpus)
                ]
                if (
                        sum(index_process_list) == 0
                ):  # if the entire list is False, all embedding calculation processes have finished
                    logger.info("Start adding the index")
                    threadHandle_index = multiprocessing.Process(
                        target=add_index,
                        args=(
                            self.custom_config.shard_dir,
                            self.config.index_path,
                        ),
                    )
                    threadHandle_index.start()
                    isAddIndexBusy = True

            # check when index building has started
            if isAddIndexBusy:

                # check still the index_building process is happening
                if not threadHandle_index.is_alive():

                    logger.info("Merging the dataset shards")
                    saved_dataset_shards = []

                    for address in glob(
                            str(self.custom_config.shard_dir) + "/*/"):
                        saved_dataset_shards.append(load_from_disk(address))

                    concat = concatenate_datasets(saved_dataset_shards)
                    concat.save_to_disk(
                        self.config.passages_path
                    )  # here we update the main passage file on the disk
                    logger.info("done updating the dataset")

                    # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker.
                    # logger.info("then updating the index")
                    # shutil.copy(self.custom_config.temp_index, self.config.idex_path)

                    logger.info(
                        "Loading new passages and iniitalzing new index")
                    self.trainer.model.module.module.model.rag.retriever.re_load(
                    )
                    self.trainer.model.module.module.model.rag.retriever.init_retrieval(
                    )

                    isEmUpdateBusy = False
                    isAddIndexBusy = False

        self.trainer.accelerator_connector.accelerator.barrier(
            "barrier")  # waint untill the index and kb get re-initialized.

        loss_tensors = self._step(batch)

        logs = {
            name: loss
            for name, loss in zip(self.loss_names, loss_tensors)
        }
        # tokens per batch
        tgt_pad_token_id = (self.tokenizer.generator.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() +
                       batch["decoder_input_ids"].ne(tgt_pad_token_id).sum())
        self.log("loss", loss_tensors[0])
        return loss_tensors[0]
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    if not os.path.exists(training_args.output_dir):
        os.makedirs(training_args.output_dir)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    elif data_args.train_file.endswith(".csv"):
        # Loading a dataset from local csv files
        datasets = load_dataset("csv",
                                data_files={
                                    "train": data_args.train_file,
                                    "validation": data_args.validation_file
                                })
    else:
        if not data_args.joint_training:
            # Loading a dataset from local json files
            datasets = load_dataset("json",
                                    data_files={
                                        "train": data_args.train_file,
                                        "validation": data_args.validation_file
                                    })
        else:
            # joint training
            datasets = {}
            for lang in MARC_LANGS:
                datasets[lang] = load_dataset(
                    "json",
                    data_files={
                        "train": data_args.train_file.format(lang),
                        "validation": data_args.validation_file.format(lang)
                    })

    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        # if not data_args.joint_training:
        tmp = datasets["en"] if data_args.joint_training else datasets

        is_regression = tmp["train"].features[
            data_args.label_column_name].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = tmp["train"].unique(data_args.label_column_name)
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        tmp = datasets["en"] if data_args.joint_training else datasets
        non_label_column_names = [
            name for name in tmp["train"].column_names
            if name != data_args.label_column_name
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            #### completely modify this condition to use MARC
            sentence1_key, sentence2_key = "review_body", None

            # if len(non_label_column_names) >= 2:
            #     sentence1_key, sentence2_key = non_label_column_names[:2]
            # else:
            #     sentence1_key, sentence2_key = non_label_column_names[0], None
            #### completely modify this condition to use MARC
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
        max_length = data_args.max_seq_length
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False
        max_length = None

    # idx2token = {v: k for k, v in tokenizer.vocab.items()}

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = {v: i for i, v in enumerate(label_list)}

    # if (
    #         model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
    #         and data_args.task_name is not None
    #         and is_regression
    # ):
    #     # Some have all caps in their config, some don't.
    #     label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    #     if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
    #         label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
    #     else:
    #         logger.warning(
    #             "Your model seems to have been trained with labels, but they don't match the dataset: ",
    #             f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
    #             "\nIgnoring the model labels as a result.",
    #         )
    # elif data_args.task_name is None:
    #     label_to_id = {v: i for i, v in enumerate(label_list)}

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        ### change tokenizer language code ###
        lang = examples["language"][0]
        lang_code = LANG2LANG_CODE[lang]
        tokenizer.set_src_lang_special_tokens(lang_code)
        ### change tokenizer language code ###
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_length,
                           truncation=True)
        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and data_args.label_column_name in examples:
            result["label"] = [
                label_to_id[l] for l in examples[data_args.label_column_name]
            ]
        return result

    if not data_args.joint_training:
        datasets = datasets.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache)
    else:
        for lang in MARC_LANGS:
            datasets[lang] = datasets[lang].map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache)
        merged_train_datasets = datasets_module.concatenate_datasets(
            [dataset["train"] for dataset in datasets.values()])
        merged_dev_datasets = datasets_module.concatenate_datasets(
            [dataset["validation"] for dataset in datasets.values()])
        datasets = DatasetDict(train=merged_train_datasets,
                               validation=merged_dev_datasets)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        # TODO: modify accordingly
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            ### modify from MSE to MAE for MARC ###
            return {"mae": (np.abs(preds - p.label_ids)).mean().item()}
            ### modify from MSE to MAE for MARC ###
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item(),
                "mae": (np.abs(preds - p.label_ids)).mean().item()
            }

    NUM_LAYERS = data_args.num_layer
    LAYERS = list(range(2 * NUM_LAYERS))
    NUM_HEADS = data_args.num_head
    for layer in LAYERS:
        heads = range(
            2 * NUM_HEADS) if layer > NUM_LAYERS - 1 else range(NUM_HEADS)

        for head in heads:
            config = AutoConfig.from_pretrained(
                model_args.config_name
                if model_args.config_name else model_args.model_name_or_path,
                num_labels=num_labels,
                finetuning_task=data_args.task_name,
                cache_dir=model_args.cache_dir,
            )
            model = MBartForSequenceClassification.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
            # Initialize our Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset if training_args.do_eval else None,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer,
                # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
                data_collator=default_data_collator
                if data_args.pad_to_max_length else None,
            )

            if data_args.do_prune:
                logger.info("*** Purturbing mBART ***")
                tasks = [data_args.task_name]
                eval_datasets = [eval_dataset]
                model.prune_heads({layer: [head]})

            # Training
            if training_args.do_train:
                train_result = trainer.train(
                    model_path=model_args.model_name_or_path if os.path.
                    isdir(model_args.model_name_or_path) else None)
                metrics = train_result.metrics

                trainer.save_model()  # Saves the tokenizer too for easy upload

                output_train_file = os.path.join(training_args.output_dir,
                                                 "train_results.txt")
                if trainer.is_world_process_zero():
                    with open(output_train_file, "w") as writer:
                        logger.info("***** Train results *****")
                        for key, value in sorted(metrics.items()):
                            logger.info(f"  {key} = {value}")
                            writer.write(f"{key} = {value}\n")

                    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
                    trainer.state.save_to_json(
                        os.path.join(training_args.output_dir,
                                     "trainer_state.json"))

            # Evaluation
            eval_results = {}
            if training_args.do_eval:
                logger.info("*** Evaluate ***")

                # Loop to handle MNLI double evaluation (matched, mis-matched)
                tasks = [data_args.task_name]
                eval_datasets = [eval_dataset]
                if data_args.task_name == "mnli":
                    tasks.append("mnli-mm")
                    eval_datasets.append(datasets["validation_mismatched"])
                lang_head_specific_folder = os.path.join(
                    training_args.output_dir, data_args.language,
                    f"layer{layer}_head{head}")
                if not os.path.exists(lang_head_specific_folder):
                    os.makedirs(lang_head_specific_folder)

                for eval_dataset, task in zip(eval_datasets, tasks):
                    eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                    output_eval_file = os.path.join(
                        lang_head_specific_folder, f"eval_results_{task}.txt")
                    if trainer.is_world_process_zero():
                        with open(output_eval_file, "w") as writer:
                            logger.info(f"***** Eval results {task} *****")
                            for key, value in sorted(eval_result.items()):
                                logger.info(f"  {key} = {value}")
                                writer.write(f"{key} = {value}\n")
                    output_eval_file = os.path.join(
                        lang_head_specific_folder, f"eval_results_{task}.json")

                    if trainer.is_world_process_zero():
                        import json
                        with open(output_eval_file, "w") as writer:
                            json.dump(eval_result, writer)

                    eval_results.update(eval_result)

            if training_args.do_predict:
                logger.info("*** Test ***")

                # Loop to handle MNLI double evaluation (matched, mis-matched)
                tasks = [data_args.task_name]
                test_datasets = [test_dataset]
                if data_args.task_name == "mnli":
                    tasks.append("mnli-mm")
                    test_datasets.append(datasets["test_mismatched"])

                for test_dataset, task in zip(test_datasets, tasks):
                    # Removing the `label` columns because it contains -1 and Trainer won't like that.
                    test_dataset.remove_columns_("label")
                    predictions = trainer.predict(
                        test_dataset=test_dataset).predictions
                    predictions = np.squeeze(
                        predictions) if is_regression else np.argmax(
                            predictions, axis=1)

                    output_test_file = os.path.join(
                        training_args.output_dir, f"test_results_{task}.txt")
                    if trainer.is_world_process_zero():
                        with open(output_test_file, "w") as writer:
                            logger.info(f"***** Test results {task} *****")
                            writer.write("index\tprediction\n")
                            for index, item in enumerate(predictions):
                                if is_regression:
                                    writer.write(f"{index}\t{item:3.3f}\n")
                                else:
                                    item = label_list[item]
                                    writer.write(f"{index}\t{item}\n")
Exemple #30
0
        # Count word-level tokens per line: skip WordPiece '##' pieces, minus 2 for special tokens.
        examples['n_real'] = [
            sum(0 if cls.tokenizer.convert_ids_to_tokens(i).startswith('##') else 1
                for i in line) - 2
            for line in examples['input_ids']]
        return examples


if __name__ == '__main__':
    from utils import get_tokenizer
    from copy import deepcopy

    t = get_tokenizer('bert-base-chinese', is_zh=True)
    ds = get_tokenized_ds('hfds_scripts/atec_dataset.py', '../sentence-embedding/data/ATEC/atec_nlp_sim_train.csv', t, tokenize_type='with_prefix')

    ds = ds['atec']
    ds2 = deepcopy(ds)

    for index, ds_ in enumerate([ds, ds2]):
        features=list(ds_.features)
        for feature in features:
            if index:
                if feature.startswith('textb') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])
            else:
                if feature.startswith('texta') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])
    
    ds = concatenate_datasets([ds, ds2])
    print(ds)
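This last example builds a single-sentence dataset out of a paired-sentence one: it copies the dataset, keeps only the `texta_`-prefixed columns in one copy and the `textb_`-prefixed columns in the other, strips the 6-character prefix so both halves share the same column names, and concatenates them. The sketch below reproduces that pattern on a toy dataset; it uses the non-in-place `remove_columns`/`rename_column` methods of newer `datasets` releases rather than the deprecated `remove_columns_`/`rename_column_` calls above, and the column names are assumptions for illustration.

# Minimal sketch of the split-and-concatenate pattern above, written against the
# non-in-place API of newer `datasets` releases (the toy columns are assumptions).
from datasets import Dataset, concatenate_datasets

paired = Dataset.from_dict({
    "texta_input_ids": [[1, 2], [3, 4]],
    "textb_input_ids": [[5, 6], [7, 8]],
    "label": [0, 1],
})

def keep_side(ds, prefix):
    # Drop the label and the other side's columns, then strip the prefix
    # ("texta_" / "textb_" are both 6 characters) from the remaining names.
    drop = [c for c in ds.column_names if not c.startswith(prefix)]
    ds = ds.remove_columns(drop)
    for col in ds.column_names:
        ds = ds.rename_column(col, col[len(prefix):])
    return ds

single = concatenate_datasets([keep_side(paired, "texta_"),
                               keep_side(paired, "textb_")])
print(single)  # 4 rows, one 'input_ids' column

Because `concatenate_datasets` requires identical features, stripping the prefixes so both halves end up with the same column names is what makes the final concatenation valid.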