Example #1
def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):
    dataset = datasets.load_dataset(
        datasets.Bio,
        data_dir=data_dir,
        cache_dir=data_dir,
        data_files=datasets.Bio.default_files(data_dir))
    dataset.rename_column_('bio', 'labels')
    dataset = dataset.map(
        lambda examples: tokenize(examples, tokenizer, max_length),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset = dataset.filter(
        lambda x: not x['overflow'],
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-filtered")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    return dataset
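Example #1 relies on a module-level tokenize(examples, tokenizer, max_length) helper that is not shown on this page (Example #2 does the same, presumably with a variant that also builds the head / labels columns it later selects). A minimal sketch of such a helper for the Bio case, assuming it mirrors the inline tokenize functions in the later examples and that 'overflow' flags sentences whose words were truncated away (both assumptions):

def tokenize(examples, tokenizer, max_length):
    # Sketch only: the Bio loader stores the token list under 'words'
    # (see Example #4); the real helper may differ.
    res = tokenizer(examples['words'],
                    is_split_into_words=True,
                    max_length=max_length,
                    truncation=True)

    word_index = []
    word_attention_mask = []
    overflow = []
    for encoding, words in zip(res.encodings, examples['words']):
        word_index.append([])
        word_attention_mask.append([])

        last_word_idx = -1
        current_length = 0
        # encoding.words maps each sub-token to its word id; skip [CLS]/[SEP]
        for word_idx in encoding.words[1:-1]:
            if word_idx != last_word_idx:
                word_index[-1].append(current_length)
                word_attention_mask[-1].append(True)
            current_length += 1
            last_word_idx = word_idx

        # Assumed criterion: flag sentences whose words did not all survive truncation
        overflow.append(len(word_index[-1]) < len(words))

    res['word_index'] = word_index
    res['word_attention_mask'] = word_attention_mask
    res['overflow'] = overflow
    return res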
Example #2
def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        data_files=datasets.Conllu.default_files(data_dir))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    dataset = dataset.map(
        lambda examples: tokenize(examples, tokenizer, max_length),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset = dataset.filter(
        lambda x: not x['overflow'],
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-filtered")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    return dataset
Example #3
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    xpos=os.path.join(data_dir,
                                                      "xpos_labels.txt"))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
    dataset.rename_column_('xpos', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)

        labels = []
        logits_mask = []
        for encoding, labels_ in zip(res.encodings, examples['labels']):
            labels.append([])
            logits_mask.append([])

            last_word_idx = -1
            labels_pointer = -1
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    logits_mask[-1].append(True)
                    labels_pointer += 1
                    labels[-1].append(labels_[labels_pointer])
                else:
                    logits_mask[-1].append(False)
                    labels[-1].append(labels_[labels_pointer])
                last_word_idx = word_idx
        res['labels'] = labels
        res['logits_mask'] = logits_mask
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'logits_mask', 'labels'
                       ])
    # shuffle is not in-place; keep the shuffled DatasetDict
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}"
            )
            for k, d in dataset.items()
        })
    return dataset, None
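A tiny worked illustration of the label alignment above (the sub-tokenization is made up; only the alignment rule comes from Example #3): every sub-token carries the label of the word it belongs to, and logits_mask is True only on a word's first sub-token.

# Hypothetical sub-tokenization: words ["Playing", "chess"], word-level labels
# ["VBG", "NN"], sub-tokens (without [CLS]/[SEP]) ["Play", "##ing", "chess"].
word_ids = [0, 0, 1]          # what encoding.words[1:-1] yields above
word_labels = ["VBG", "NN"]

labels, logits_mask = [], []
labels_pointer, last_word_idx = -1, -1
for word_idx in word_ids:
    if word_idx != last_word_idx:
        labels_pointer += 1
        logits_mask.append(True)    # first sub-token of a word: scored
    else:
        logits_mask.append(False)   # continuation sub-token: masked out
    labels.append(word_labels[labels_pointer])
    last_word_idx = word_idx

assert labels == ["VBG", "VBG", "NN"]
assert logits_mask == [True, False, True]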
Example #4
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Bio,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    bio=os.path.join(data_dir,
                                                     "ner_labels.txt"))
    dataset.rename_column_('bio', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['words'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])

            last_word_idx = -1
            current_length = 0
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx

        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_info.task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}"
            )
            for k, d in dataset.items()
        })
    return dataset, (
        f1_score,
        dataset[datasets.Split.TRAIN].features['labels'].feature.names)
Example #5
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        deprel=os.path.join(data_dir, "dep_labels.txt")
    )
    dataset.remove_columns_(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"])
    dataset.rename_column_('deprel', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)

    # Map each word to the offset of its first sub-token (word_index / word_attention_mask)
    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True
        )
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])

            last_word_idx = -1
            current_length = 0
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx

        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask

        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples), batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_info.task_name}-{k}-tokenized") for k, d in dataset.items()
        }
    )
    dataset.set_format(type='torch', columns=[
        'input_ids', 'token_type_ids', 'attention_mask', 'word_index', 'word_attention_mask', 'head', 'labels'
    ])
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}") for k, d in
            dataset.items()
        }
    )
    return dataset, None
Example #6
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir)
    dataset.remove_columns_([
        "id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps",
        "misc"
    ])

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    # Word-start labels: first sub-token of a word -> 1 ('B'), continuation -> 0 ('I')
    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        labels = []
        for encoding in res.encodings:
            labels.append([])
            last_word_idx = -1
            for word_idx in encoding.words[1:-1]:
                labels[-1].append(int(word_idx != last_word_idx))
                last_word_idx = word_idx

        res['labels'] = labels
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset
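The same toy sub-tokenization shows the {'B': 1, 'I': 0} scheme used in Example #6: each sub-token is labelled 1 if it starts a new word and 0 otherwise.

# Hypothetical sub-tokens for ["Playing", "chess"]: ["Play", "##ing", "chess"]
word_ids = [0, 0, 1]                 # encoding.words[1:-1]
labels, last_word_idx = [], -1
for word_idx in word_ids:
    labels.append(int(word_idx != last_word_idx))   # 'B' -> 1, 'I' -> 0
    last_word_idx = word_idx
assert labels == [1, 0, 1]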
Example #7
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    deps=os.path.join(data_dir,
                                                      "deps_labels.txt"))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    # Map each word to its first sub-token and build head/label matrices from 'deps'
    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])

            last_word_idx = -1
            current_length = 0
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx

        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask

        heads = []
        labels = []
        for forms, deps in zip(examples['form'], examples['deps']):
            sentence_len = len(forms)
            heads.append(
                np.zeros((sentence_len, sentence_len + 1), dtype=np.int64))
            labels.append(
                np.zeros((sentence_len, sentence_len + 1), dtype=np.int64))
            for idx, head, rel in zip(deps['id'], deps['head'], deps['rel']):
                heads[-1][idx, head] = 1
                labels[-1][idx, head] = rel

        res['head'] = heads
        res['labels'] = labels
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    dataset = dataset.shuffle()
    return dataset, get_graph_entities
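A small worked example of the head / labels matrices Example #7 builds from the deps field. The sentence and relation ids below are made up; the (sentence_len, sentence_len + 1) shape and the indexing follow the code above, with the extra column presumably reserved for the virtual root (an assumption, since the Conllu preprocessing that produces 'deps' is not shown).

import numpy as np

# Hypothetical 3-token sentence: rows are 0-based token ids, column 0 the root.
deps = {'id': [0, 1, 2], 'head': [2, 2, 0], 'rel': [3, 5, 1]}   # rel = label ids
sentence_len = 3

heads = np.zeros((sentence_len, sentence_len + 1), dtype=np.int64)
labels = np.zeros((sentence_len, sentence_len + 1), dtype=np.int64)
for idx, head, rel in zip(deps['id'], deps['head'], deps['rel']):
    heads[idx, head] = 1      # arc indicator: token idx is attached to head
    labels[idx, head] = rel   # relation label id on that arc

# heads == [[0, 0, 1, 0],        labels == [[0, 0, 3, 0],
#           [0, 0, 1, 0],                   [0, 0, 5, 0],
#           [1, 0, 0, 0]]                   [1, 0, 0, 0]]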
Example #8
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Srl,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    labels=os.path.join(
                                        data_dir, "srl_labels.txt"))
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    # Map each word to its first sub-token and build the predicate-role label matrix
    def tokenize(examples):
        res = tokenizer(
            examples['words'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        for encoding in res.encodings:
            word_index.append([])

            last_word_idx = -1
            current_length = 0
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                current_length += 1
                last_word_idx = word_idx

        res['word_index'] = word_index
        res['word_attention_mask'] = [[True] * len(index)
                                      for index in word_index]

        labels = []
        for predicates, roles in zip(examples['predicate'], examples['roles']):
            sentence_len = len(predicates)
            labels.append(
                np.zeros((sentence_len, sentence_len), dtype=np.int64))

            for idx, predicate in enumerate(predicates):
                if predicate != '_':
                    srl = np.asarray(roles.pop(0), dtype=np.int64)
                    labels[-1][idx, :] = srl

        res['labels'] = labels
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset
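A small worked example of the SRL labels matrix built above (the sentence and role ids are made up; the (sentence_len, sentence_len) shape and the row-per-predicate layout follow the code): each row at a predicate position holds the role id of every word with respect to that predicate, and all other rows stay zero.

import numpy as np

# Hypothetical 4-word sentence with a single predicate at position 1.
predicates = ['_', 'give', '_', '_']
roles = [[2, 0, 4, 4]]     # role label ids of each word w.r.t. that predicate

sentence_len = len(predicates)
labels = np.zeros((sentence_len, sentence_len), dtype=np.int64)
for idx, predicate in enumerate(predicates):
    if predicate != '_':
        labels[idx, :] = np.asarray(roles.pop(0), dtype=np.int64)

# labels == [[0, 0, 0, 0],
#            [2, 0, 4, 4],
#            [0, 0, 0, 0],
#            [0, 0, 0, 0]]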
Example #9
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir)
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    # Map each word to its first sub-token and build head/label matrices from 'deps'
    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        for encoding in res.encodings:
            word_index.append([])

            last_word_idx = -1
            current_length = 0
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                current_length += 1
                last_word_idx = word_idx

        res['word_index'] = word_index
        res['word_attention_mask'] = [[True] * len(index)
                                      for index in word_index]

        heads = []
        labels = []
        for forms, deps in zip(examples['form'], examples['deps']):
            sentence_len = len(forms)
            heads.append([[0 for j in range(sentence_len + 1)]
                          for i in range(sentence_len)])
            labels.append([[0 for j in range(sentence_len + 1)]
                           for i in range(sentence_len)])
            for idx, head, rel in zip(deps['id'], deps['head'], deps['rel']):
                heads[-1][idx][head] = 1
                labels[-1][idx][head] = rel
        res['head'] = heads
        res['labels'] = labels
        for word_index, head in zip(res['word_index'], res['head']):
            assert len(word_index) == len(head)
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    dataset = dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset