def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """

    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
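        # Replace string labels with their integer ids (label2id), then cast the column to int.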
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text':
        Value('string'),
        'label':
        ClassLabel(num_classes=len(id2label),
                   names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
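
# A minimal usage sketch (assumes `paths`, `label2id`/`id2label`, `load_df`, and
# `preprocess_tweet` are defined in the surrounding module, as above):
#
#     train_ds, dev_ds, test_ds = load_datasets(lang="es")
#     print(train_ds.features["label"].names)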
Example #2
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(
            args.data.sub_datasets.split(","),
            args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)

        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # sub dataset must have same features: ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features

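        # Keep only the first sub_dataset_len examples and rebuild the Dataset so it carries the shared feature schema.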
        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(),
                                              features=features)

        concatenate_list.append(new_sub_dataset.flatten_indices())

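    # Cast the main train split to the (shared) sub-dataset schema before concatenating everything.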
    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(),
                                        features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] +
                                         concatenate_list)

    return train_dataset
def save_data(train_df, val_df):
    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None),
        'question_type':
        Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train':
        Dataset.from_pandas(train_df, features=train_f),
        'validation':
        Dataset.from_pandas(val_df, features=train_f)
    })
    with open("../../data/question_type.pkl", "wb") as file:
        pickle.dump(train_datasets, file)
    def load_domain_split_dataset(self, data_dir, logger=None):
        """
        Loads the break dataset with a domain split: train on the text domains, validation + test on the DB + image domains.
        :param data_dir:    The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger:      A logger for logging events.
        :return:            The loaded dataset.
        """
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_domain_split.pkl"
        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating domain split dataset...')
            text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
            image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
            DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
            image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes
            # Collect matching rows in plain lists and build each DataFrame once;
            # appending to a DataFrame row by row is slow and DataFrame.append
            # was removed in pandas 2.0.
            train_filtered_rows = []
            validation_filtered_rows = []
            test_filtered_rows = []

            for example in self.dataset_logical['train']:
                if example['question_id'].startswith(
                        text_domain_dataset_prefixes):
                    train_filtered_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['question_id'].startswith(image_plus_DB):
                    validation_filtered_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['question_id'].startswith(image_plus_DB):
                    test_filtered_rows.append(example)

            train_filtered = pd.DataFrame(train_filtered_rows)
            validation_filtered = pd.DataFrame(validation_filtered_rows)
            test_filtered = pd.DataFrame(test_filtered_rows)

            # TODO delete this?
            # train_dataset = self.dataset_logical['train'].filter(
            #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
            # validation_dataset = self.dataset_logical['validation'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # test_dataset = self.dataset_logical['test'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # train_filtererd_ds = Dataset.from_pandas(train_filtererd)
            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
Example #5
def clean_datasets():
    config = read_config()
    if config['kaggle']:
        trainset, testset = get_datasets("../input/commonlitreadabilityprize/train.csv",
                                         discard=["url_legal", "license"])
        trainset = trainset.rename(columns={'target': 'labels', 'excerpt': 'text'})
        testset = testset.rename(columns={'target': 'labels', 'excerpt': 'text'})
    else:
        trainset, testset = get_datasets(config['dataset']['filename'],
                                         discard=config['dataset']['discard'])

    trainset = Dataset.from_pandas(trainset)
    testset = Dataset.from_pandas(testset)

    return trainset, testset
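
# A minimal usage sketch (read_config and get_datasets come from the
# surrounding project; the CSV path above is only the Kaggle example):
#
#     trainset, testset = clean_datasets()
#     print(trainset.column_names)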
Example #6
def load_hf_dataset(data, tokenizer, args):
    if isinstance(data, str):
        dataset = load_dataset(
            "csv",
            data_files=data,
            delimiter="\t",
            download_mode="force_redownload"
            if args.reprocess_input_data else "reuse_dataset_if_exists",
        )
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(
            x, tokenizer=tokenizer, args=args),
        batched=True,
    )

    dataset.set_format(type="pt", columns=["input_ids", "attention_mask"])

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset
Example #7
def load_dataset(dataset: str = "ChnSentiCorp", split: str = "train"):
    df = pd.read_csv(f"/data/{dataset}_{split}.tsv", sep="\t")
    ds = Dataset.from_pandas(df)
    # Assigning to ds.features has no effect on an Arrow-backed dataset;
    # cast the column to a ClassLabel feature instead.
    ds = ds.cast_column("label", ClassLabel(num_classes=2, names=["pos", "neg"]))

    return ds
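
# A minimal usage sketch (assumes /data/ChnSentiCorp_train.tsv exists and has a
# "label" column of 0/1 values):
#
#     ds = load_dataset("ChnSentiCorp", split="train")
#     print(ds.features["label"])  # ClassLabel with names ["pos", "neg"]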
def test_from_hf_datasets_multilabel():
    TEST_HF_DATASET_DATA_MULTILABEL = Dataset.from_pandas(TEST_DATA_FRAME_DATA_MULTILABEL)
    dm = TextClassificationData.from_hf_datasets(
        "sentence",
        ["lab1", "lab2"],
        train_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        val_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        test_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        predict_hf_dataset=TEST_HF_DATASET_DATA_MULTILABEL,
        batch_size=1,
    )

    assert dm.multi_label

    batch = next(iter(dm.train_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.val_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.test_dataloader()))
    assert all([label in [0, 1] for label in batch[DataKeys.TARGET][0]])
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.predict_dataloader()))
    assert isinstance(batch[DataKeys.INPUT][0], str)
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })

    dataset = Dataset.from_pandas(df, features=features)

    dataset = dataset.map(
        lambda x: tokenizer(x["text"],
                            x["context"],
                            padding="longest",
                            truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(
            type='torch',
            columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)

    return dataset
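
# A minimal usage sketch (the model name is only an example; df needs 'id',
# 'context' and 'text' columns matching the Features above):
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     ds = build_dataset(df, tokenizer, batch_size=32)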
Example #10
def getDataset(config):
    """
    build dataset from the h5 file
    """
    atu = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = atu[["text", "atu", "desc"]]

    dataset = Dataset.from_pandas(atu)
    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation=True,
                         padding=True)

    dataset = (dataset
               .shuffle(seed=config.seed)
               .map(tokenize, batched=True))

    dataset.set_format(
        type="numpy",
        columns=['input_ids', 'attention_mask', "atu", "desc", "text"])

    return dataset
Example #11
def parse_test_key(test: str, key: str):
    # Map the gold label strings to integer class ids; anything else maps to 0.
    label_map = {'oth': 4, 'grp': 3, 'ind': 2, 'prof': 1}
    tweets = []
    with open(test, 'r') as file1, open(key, 'r') as file2:
        # Skip the header line of each file.
        file1.readline()
        file2.readline()
        while True:
            line1 = file1.readline()
            line2 = file2.readline()

            if not line1 or not line2:
                break
            split_line1 = line1.split('\t')
            split_line2 = line2.split('\t')

            label = label_map.get(split_line2[1].strip(), 0)
            tweets.append([split_line1[0], detweetify(split_line1[1]), label])

    return Dataset.from_pandas(
        pd.DataFrame(tweets, columns=['id', 'tweet', 'labels']))
def test_from_hf_datasets():
    TEST_HF_DATASET_DATA = Dataset.from_pandas(TEST_DATA_FRAME_DATA)
    dm = TextClassificationData.from_hf_datasets(
        "sentence",
        "lab1",
        train_hf_dataset=TEST_HF_DATASET_DATA,
        val_hf_dataset=TEST_HF_DATASET_DATA,
        test_hf_dataset=TEST_HF_DATASET_DATA,
        predict_hf_dataset=TEST_HF_DATASET_DATA,
        batch_size=1,
    )

    batch = next(iter(dm.train_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.val_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.test_dataloader()))
    assert batch[DataKeys.TARGET].item() in [0, 1]
    assert isinstance(batch[DataKeys.INPUT][0], str)

    batch = next(iter(dm.predict_dataloader()))
    assert isinstance(batch[DataKeys.INPUT][0], str)
Example #13
def load_hf_dataset(data, tokenizer, args, multi_label):
    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t")
    else:
        dataset = HFDataset.from_pandas(data)

    if args.labels_map and not args.regression:
        dataset = dataset.map(
            lambda x: map_labels_to_numeric(x, multi_label, args))

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(
            x, tokenizer=tokenizer, max_seq_length=args.max_seq_length),
        batched=True,
    )

    if args.model_type in ["bert", "xlnet", "albert", "layoutlm"]:
        dataset.set_format(type="pt",
                           columns=[
                               "input_ids", "token_type_ids", "attention_mask",
                               "labels"
                           ])
    else:
        dataset.set_format(type="pt",
                           columns=["input_ids", "attention_mask", "labels"])

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset
Example #14
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {
        get_lang(file): load_df(file, test=True)
        for file in test_files
    }

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))
    """
    Tokenize tweets
    """

    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text':
        Value('string'),
        'lang':
        Value('string'),
        'label':
        ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
Example #15
def gen_fake_data():
    # 16 identical sentence pairs joined with the <SIMSEP> separator
    # ("今天是星期四" / "今天是周四" both mean "today is Thursday").
    _data = {'text': ['今天是星期四<SIMSEP>今天是周四'] * 16}
    _df = pd.DataFrame(_data)
    _dataset = Dataset.from_pandas(_df)
    return _dataset
Example #16
    def read_txt(txt_path):
        # Each line has the form "path|sentence": read whole lines, then split
        # once on the literal '|' separator.
        data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])
        data[['path', 'sentence']] = data['path'].str.split('|', n=1, expand=True)

        data = Dataset.from_pandas(data)
        return data
Example #17
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"ETRI dataset not found: please rename your file to {etr_path} and place the data there.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    #  print(etr_dict["data"][0])
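    # Flatten the SQuAD-style nested JSON (data -> paragraphs -> qas) into parallel column lists.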
    new_dataset = defaultdict(list)

    cnt = 0

    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)

            cnt += 1

    f = Features({
        "answers":
        Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id":
        Value(dtype="string", id=None),
        "context":
        Value(dtype="string", id=None),
        "question":
        Value(dtype="string", id=None),
        "title":
        Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
    def load_length_split_dataset(self, data_dir, logger=None):
        """
        Loads the break dataset with a length split based on the number of operators.
        Train - at most 4 steps.
        val + test - more than 4 steps.
        :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger: A logger for logging events.
        :return: The loaded dataset.
        """
        # TODO datadir required in signature?
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_length_split.pkl"

        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating length split dataset...')
            threshold_amount_ops = 4

            # Collect matching rows in plain lists and build each DataFrame once;
            # DataFrame.append was removed in pandas 2.0.
            train_filtered_rows = []
            validation_filtered_rows = []
            test_filtered_rows = []

            for example in self.dataset_logical['train']:
                if example['operators'].count(',') < threshold_amount_ops:
                    train_filtered_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    validation_filtered_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    test_filtered_rows.append(example)

            train_filtered = pd.DataFrame(train_filtered_rows)
            validation_filtered = pd.DataFrame(validation_filtered_rows)
            test_filtered = pd.DataFrame(test_filtered_rows)

            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
def update_metadata(token, commit_sha):
    """
    Update the metadata for the Transformers repo.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir,
                          clone_from="huggingface/transformers-metadata",
                          repo_type="dataset",
                          use_auth_token=token)

        frameworks_table = get_frameworks_table()
        frameworks_dataset = Dataset.from_pandas(frameworks_table)
        frameworks_dataset.to_json(os.path.join(tmp_dir, "frameworks.json"))

        tags_dataset = Dataset.from_json(
            os.path.join(tmp_dir, "pipeline_tags.json"))
        table = {
            tags_dataset[i]["model_class"]:
            (tags_dataset[i]["pipeline_tag"], tags_dataset[i]["auto_class"])
            for i in range(len(tags_dataset))
        }
        table = update_pipeline_and_auto_class_table(table)

        # Sort the model classes so that nondeterministic ordering does not create spurious update commits.
        model_classes = sorted(list(table.keys()))
        tags_table = pd.DataFrame({
            "model_class":
            model_classes,
            "pipeline_tag": [table[m][0] for m in model_classes],
            "auto_class": [table[m][1] for m in model_classes],
        })
        tags_dataset = Dataset.from_pandas(tags_table)
        tags_dataset.to_json(os.path.join(tmp_dir, "pipeline_tags.json"))

        if repo.is_repo_clean():
            print("Nothing to commit!")
        else:
            if commit_sha is not None:
                commit_message = (
                    f"Update with commit {commit_sha}\n\nSee: "
                    f"https://github.com/huggingface/transformers/commit/{commit_sha}"
                )
            else:
                commit_message = "Update"
            repo.push_to_hub(commit_message)
Example #20
def load_custom_dataset_commonvoice_format(path, split, path_column='path'):
    # TODO: add support for multiple split to be together. Example: train+validation
    dataset_path = Path(path) / (split + '.tsv')
    df = pd.read_csv(dataset_path, sep='\t')
    df[path_column] = [
        str((Path(path) / p).absolute()) for p in df[path_column]
    ]
    return Dataset.from_pandas(df)
    def init_dataset(self, X, y):
        dataset = Dataset.from_pandas(
            pd.DataFrame({
                'text':
                X,
                'label': [self.idx_to_label.index(lbl_str) for lbl_str in y]
            }))

        return dataset.map(self._tokenize_function, batched=True)
def make_negative_dataset(args,
                          bm25,
                          queries,
                          answers,
                          contexts,
                          name,
                          num=16):
    total = []
    scores, indices = bm25.get_relevant_doc_bulk(queries, topk=num * 2)
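    # Retrieve 2*num candidate passages per query so a spare is available if the
    # gold context already appears among the negatives.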

    answers, indices = np.array(answers, dtype="object"), np.array(indices)
    contexts = np.array(contexts, dtype="object")

    for idx, query in enumerate(queries):
        label = idx % num

        answer = answers[idx]
        context_list = contexts[indices[idx]]

        check_in = np.argwhere(context_list == answer)

        if check_in.shape[0] == 0:
            context_list[label] = answer
            context_list = context_list[:num]
        else:
            context_list[check_in[0][0]] = context_list[num]
            context_list[label] = answer
            context_list = context_list[:num]

        if idx % 100 == 0:
            print("query: ", query)
            print("answer: ", answer)
            print("negative:", context_list)
            print("label:", label)

        tmp = {
            "query": query,
            "negative_samples": context_list,
            "label": label
        }

        total.append(tmp)

    df = pd.DataFrame(total)

    f = Features({
        "query":
        Value(dtype="string", id=None),
        "negative_samples":
        Sequence(feature=Value(dtype="string", id=None), length=-1, id=None),
        "label":
        Value(dtype="int32", id=None),
    })

    dataset = Dataset.from_pandas(df, features=f)
    dataset.save_to_disk(os.path.join(args.path.train_data_dir, name))
Example #23
def parse_training(file_location: str):
    tweets = []
    with open(file_location, 'r') as file:
        lines = file.readlines()

    # Skip the header line; the label is 1 when the third column is 'OFF', 0 otherwise.
    for line in lines[1:]:
        split_line = line.split('\t')
        tweets.append([split_line[0], detweetify(split_line[1]), int(split_line[2].strip() == 'OFF')])

    return Dataset.from_pandas(pd.DataFrame(tweets, columns=['id', 'tweet', 'labels']))
Example #24
    def load_eval_data(self, force_reload=False, save_datasets=True) -> None:
        eval_save_dir = self.save_dir / "eval"
        try:
            if force_reload:
                # Force the except branch below so the data is rebuilt.
                raise Exception()
            self.datasets["eval"] = DatasetDict.load_from_disk(eval_save_dir)
            print("Evaluation data loaded from disk.")
        except Exception:
            print("Regenerating evaluation data.")
            eval_df_dict = self._parse_eval_data(self.eval_dir)
            self.datasets["eval"] = DatasetDict({
                "far": Dataset.from_pandas(eval_df_dict["far"]),
                "obj": Dataset.from_pandas(eval_df_dict["obj"]),
            })
            if save_datasets:
                print(f"Saving evaluation dataset to {eval_save_dir}")
                self.datasets["eval"].save_to_disk(eval_save_dir)
Example #25
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(
        device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir,
                                                "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

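    # Build an HNSW index (faiss_d = embedding dimension, faiss_m = HNSW connectivity)
    # using inner-product similarity.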
    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
Example #26
    def load_data(
        self,
        data_frame: pd.DataFrame,
        input_key: str,
        target_keys: Optional[Union[str, List[str]]] = None,
        target_formatter: Optional[TargetFormatter] = None,
    ) -> Dataset:
        return super().load_data(Dataset.from_pandas(data_frame),
                                 input_key,
                                 target_keys,
                                 target_formatter=target_formatter)
Example #27
def load_dt_data(data):
    data_df = pd.DataFrame(data, columns=['path', 'sentence'])
    data = Dataset.from_pandas(data_df)

    CHARS_TO_IGNORE = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\$\©\~\)\(\§\'\d]'

    def remove_special_characters(batch):
        batch["sentence"] = re.sub(CHARS_TO_IGNORE, '',
                                   batch["sentence"]).lower() + " "
        return batch

    def dt_speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = speech_array[0].numpy()
        batch["sampling_rate"] = sampling_rate
        batch["target_text"] = batch["sentence"]
        return batch

    def resample(batch):
        batch["speech"] = librosa.resample(np.asarray(batch["speech"]),
                                           orig_sr=22_050,
                                           target_sr=16_000)
        batch["sampling_rate"] = 16_000
        return batch

    # print(data)
    data = data.map(remove_special_characters)
    data = data.map(dt_speech_file_to_array_fn,
                    remove_columns=data.column_names)
    data = data.map(resample, num_proc=4)

    # prepare_dataset below needs a processor (create_processor is this project's helper).
    processor = create_processor()

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values

        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    data = data.map(prepare_dataset,
                    remove_columns=data.column_names,
                    batch_size=4,
                    num_proc=4,
                    batched=True)

    return data
def load_datasets(seed=2021, preprocessing_args={}):
    """
    Return train, dev, test datasets
    """
    train_df = load_df(os.path.join(semeval_dir, "train.csv"))

    test_df = load_df(os.path.join(semeval_dir, "test.csv"))

    # Use the seed argument so the split is reproducible.
    train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=seed)

    print(len(train_df), len(dev_df), len(test_df))
    """
    Tokenize tweets
    """

    en_preprocess = lambda x: preprocess_tweet(
        x, lang="en", **preprocessing_args)

    train_df["text"] = train_df["text"].apply(en_preprocess)
    dev_df["text"] = dev_df["text"].apply(en_preprocess)
    test_df["text"] = test_df["text"].apply(en_preprocess)

    features = Features({
        'id':
        Value('int64'),
        'text':
        Value('string'),
        'label':
        ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"])
    })

    columns = ["text", "id", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
Example #29
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####

    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )
    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    #df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'],
                               topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:  # test data has no answers, so the dataset consists only of id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    elif training_args.do_eval:  # train data has answers, so the dataset consists of id, question, context, and answers.
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example #30
def load_hf_dataset(data, encoder_tokenizer, decoder_tokenizer, args):
    if isinstance(data, str):
        dataset = load_dataset(
            "csv",
            data_files=data,
            delimiter="\t",
            download_mode="force_redownload"
            if args.reprocess_input_data else "reuse_dataset_if_exists",
            cache_dir=args.dataset_cache_dir,
        )
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        lambda x: preprocess_batch_for_hf_dataset(
            x,
            encoder_tokenizer=encoder_tokenizer,
            decoder_tokenizer=decoder_tokenizer,
            args=args,
        ),
        batched=True,
    )

    if args.model_type == "bart":
        column_names = [
            "source_ids",
            "source_mask",
            "target_ids",
        ]
    elif args.model_type == "mbart":
        column_names = [
            "input_ids",
            "attention_mask",
            "decoder_input_ids",
            "labels",
        ]
    else:
        column_names = [
            "input_ids",
            "attention_mask",
            "decoder_input_ids",
        ]

    dataset.set_format(type="pt", columns=column_names)

    if isinstance(data, str):
        # This is not necessarily a train dataset. The datasets library insists on calling it train.
        return dataset["train"]
    else:
        return dataset