Example No. 1
    def test_push_dataset_dict_to_hub_custom_features(self):
        features = Features({
            "x": Value("int64"),
            "y": ClassLabel(names=["neg", "pos"])
        })
        ds = Dataset.from_dict({
            "x": [1, 2, 3],
            "y": [0, 0, 1]
        }, features=features)

        local_ds = DatasetDict({"test": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["test"].features.keys()),
                                 list(hub_ds["test"].features.keys()))
            self.assertDictEqual(local_ds["test"].features,
                                 hub_ds["test"].features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
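For reference, a minimal sketch of what the ClassLabel feature used in the test above does with label values; the toy data here is illustrative only and is not part of the original test suite:

from datasets import ClassLabel, Dataset, Features, Value

# Toy dataset mirroring the features in the test above (illustration only).
features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

label_feature = ds.features["y"]
print(label_feature.names)           # ['neg', 'pos']
print(label_feature.int2str(1))      # 'pos'
print(label_feature.str2int("neg"))  # 0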
Example No. 2
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """

    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=len(id2label),
                            names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
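Example No. 2 maps string labels to integer ids by hand via label2id before attaching the ClassLabel feature. If only the final encoding matters, recent versions of datasets can do that step directly; a hedged sketch with toy data (the column values and label names are assumptions, not from the original project):

from datasets import ClassLabel, Dataset

# Toy frame standing in for train_df (illustration only).
ds = Dataset.from_dict({"text": ["a", "b", "c"], "label": ["joy", "anger", "joy"]})

# Option 1: infer a ClassLabel from the string column.
encoded = ds.class_encode_column("label")
print(encoded.features["label"].names)

# Option 2: map strings to ids explicitly, then cast the column.
labels = ClassLabel(names=["anger", "joy"])
encoded = ds.map(lambda ex: {"label": labels.str2int(ex["label"])})
encoded = encoded.cast_column("label", labels)
print(encoded.features["label"])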
Example No. 3
def complex_dataset():
    features = {
        "translation": Translation(languages=("en", "fr")),
        "sentiment": ClassLabel(num_classes=2),
    }

    return datasets.Dataset.from_dict(COMPLEX_DATA, Features(features))
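COMPLEX_DATA is defined elsewhere in that test module; a hedged sketch of data with the same shape, using made-up values that match the Translation and ClassLabel features above:

from datasets import ClassLabel, Dataset, Features, Translation

# Hypothetical stand-in for COMPLEX_DATA (illustration only).
data = {
    "translation": [{"en": "the cat", "fr": "le chat"}],
    "sentiment": [1],
}
features = Features({
    "translation": Translation(languages=["en", "fr"]),
    "sentiment": ClassLabel(num_classes=2),
})
ds = Dataset.from_dict(data, features)
print(ds.features)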
Example No. 4
def dictionary():
    table = csv.read_csv("./data/train.csv",
                         parse_options=ParseOptions(delimiter="\t"))
    # datasets = load_dataset("csv", data_files="./data/train.csv", delimiter="\t", quoting=csv_lib.QUOTE_NONE)
    # train_dataset: Dataset = datasets["train"]
    # train_dataset = Dataset(arrow_table=table)
    # table = train_dataset.data
    aa = set(table.column("label").to_pylist())

    class_label_ = table.column("label").unique()
    class_label = ClassLabel(num_classes=len(class_label_),
                             names=class_label_.tolist())
    # ner_ids_list: ChunkedArray = class_label.str2int(label.column('label').to_numpy())
    return class_label
Example No. 5
def add_label_names(dataset: Dataset, label_column: str,
                    label_names: List[str]):
    """Adds `names` to a specified `label` column.
    All labels (i.e. integers) in the dataset should be less than the number of label names.
    Args:
        dataset: a Dataset to add label names to
        label_column: the name of the label column (such as `label` or `labels`) in the dataset
        label_names: a list of label names
    Returns:
        Dataset: A copy of the passed `dataset` with added label names
    """
    new_features: Features = dataset.features.copy()
    new_features[label_column] = ClassLabel(names=label_names)
    return dataset.cast(new_features)
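A possible use of add_label_names with a toy dataset (illustration only; the text and label names are assumptions):

from datasets import Dataset

ds = Dataset.from_dict({"text": ["good movie", "bad movie"], "label": [1, 0]})
named = add_label_names(ds, "label", ["neg", "pos"])
print(named.features["label"].names)       # ['neg', 'pos']
print(named.features["label"].int2str(1))  # 'pos'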
Example No. 6
    def test_push_dataset_to_hub_custom_features(self):
        features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            self.assertEqual(ds[:], hub_ds[:])
        finally:
            self.cleanup_repo(ds_name)
Example No. 7
def read_dataset_from_csv(csv_path):
    """
    Read the prepared CSV data as a Dataset object.
    """
    df = pd.read_csv(csv_path,
                     converters={
                         'token': str,
                         'written': str,
                         'spoken': str
                     })
    feature_tag = Sequence(
        ClassLabel(num_classes=3, names=list(pd.factorize(df['tag'])[1])))
    df['tag'] = df['tag'].apply(feature_tag.feature.str2int)
    df_text = df.groupby(['sentence_id']).agg({'token': list, 'tag': list})
    dataset = Dataset.from_pandas(df_text)
    dataset.features["tag"] = feature_tag
    return dataset
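Note that assigning to dataset.features["tag"] in place, as above, may not update the underlying Arrow schema; Dataset.cast is the documented way to attach a Sequence(ClassLabel) feature. A minimal sketch with toy token/tag data (illustration only):

from datasets import ClassLabel, Dataset, Features, Sequence, Value

ds = Dataset.from_dict({
    "token": [["hello", "world"], ["bye"]],
    "tag": [[0, 1], [2]],
})
features = Features({
    "token": Sequence(Value("string")),
    "tag": Sequence(ClassLabel(num_classes=3, names=["O", "B", "I"])),
})
ds = ds.cast(features)
print(ds.features["tag"].feature.names)  # ['O', 'B', 'I']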
Example No. 8
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {
        get_lang(file): load_df(file, test=True)
        for file in test_files
    }

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))
    """
    Tokenize tweets
    """

    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
Example No. 9
def load_datasets(seed=2021, preprocessing_args={}):
    """
    Return train, dev, test datasets
    """
    train_df = load_df(os.path.join(semeval_dir, "train.csv"))

    test_df = load_df(os.path.join(semeval_dir, "test.csv"))

    train_df, dev_df = train_test_split(train_df, test_size=0.2)

    print(len(train_df), len(dev_df), len(test_df))
    """
    Tokenize tweets
    """

    en_preprocess = lambda x: preprocess_tweet(
        x, lang="en", **preprocessing_args)

    train_df["text"] = train_df["text"].apply(en_preprocess)
    dev_df["text"] = dev_df["text"].apply(en_preprocess)
    test_df["text"] = test_df["text"].apply(en_preprocess)

    features = Features({
        'id': Value('int64'),
        'text': Value('string'),
        'label': ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"])
    })

    columns = ["text", "id", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
Example No. 10
def load_datasets():
    """
    Return train, dev, test datasets
    """
    train_files = glob("data/tass2020/train/*.tsv")
    dev_files = glob("data/tass2020/dev/*.tsv")
    test_files = glob("data/tass2020/test1.1/*.tsv")

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {get_lang(file): load_df(file) for file in test_files}

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))

    train_df["text"] = train_df["text"].apply(preprocess_tweet)
    dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
    test_df["text"] = test_df["text"].apply(preprocess_tweet)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    train_dataset = Dataset.from_pandas(train_df[["text", "label"]],
                                        features=features)
    dev_dataset = Dataset.from_pandas(dev_df[["text", "label"]],
                                      features=features)
    test_dataset = Dataset.from_pandas(test_df[["text", "label"]],
                                       features=features)

    return train_dataset, dev_dataset, test_dataset
Example No. 11
    def predict(self, input_path, output_path):
        key = 'tmp'
        input_df = pd.DataFrame()

        if self.pretrained:
            input_df['src_token'] = read_txt(input_path)
            input_df['src_token'] = input_df['src_token'].str.lower()
            input_df['token'] = input_df['src_token'].str.split()
            input_df['tag'] = input_df['token'].apply(lambda x: ['O'] * len(x))
            input_df['sentence_id'] = input_df.index

            trainer = Trainer(model=self.model,
                              tokenizer=self.tokenizer,
                              data_collator=self.data_collator)
            feature_tag = Sequence(
                ClassLabel(num_classes=3, names=self.label_list))
            input_df['tag'] = input_df['tag'].apply(
                feature_tag.feature.str2int)
            eval_dataset = Dataset.from_pandas(input_df)
            eval_dataset.features["tag"] = feature_tag
            # predict
            tokenized_datasets = DatasetDict({
                key: eval_dataset
            }).map(self.tokenize_and_align_labels, batched=True)
            _, true_predictions = self.predict_dataset(trainer,
                                                       tokenized_datasets[key])
            result = save_classifier_result(eval_dataset, true_predictions,
                                            output_path)
            return result
        else:
            input_df['token'] = read_txt(input_path)
            input_df['sentence_id'] = input_df.index
            input_df['tag'] = 'B'
            input_df.to_csv(output_path, index=False)
            print("Result saved to ", output_path)
            return input_df
Example No. 12
    'fn': 'grey',
    'tb': 'beige'
}
id2label = {v: k for k, v in label2id.items()}
LABELS = [label2id[L] for L in LABELS]
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
FEATURES = Features({
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=LABELS + [max(LABELS) + 1]))
})
NUM_LABELS = len(LABELS)
PROCESSOR_PICKLE = f"processor_module{NUM_LABELS}.pickle"
MODEL_PICKLE = f"model_module{NUM_LABELS}.pickle"
EPOCHS_LAYOUT = 84
PDF_UPLOAD_DIR = hidden_folder + "/pdf_upload/"
ELMO_DIFFERENCE_MODEL_PATH = hidden_folder + "elmo_difference_models"
ELMO_DIFFERENCE_COLLECTION_PATH = hidden_folder + "elmo_difference_collection"
PORT = 7789

TOPIC_TEXT_LENGTH = 180
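A Features block like the one above is typically applied by passing features= to Dataset.map when the mapping function changes the schema. A much smaller, hedged sketch of that pattern with a toy schema (none of these column names or values come from the original project):

from datasets import ClassLabel, Dataset, Features, Sequence, Value

toy_features = Features({
    "input_ids": Sequence(Value("int64")),
    "labels": Sequence(ClassLabel(names=["header", "question", "answer"])),
})

ds = Dataset.from_dict({"words": [["a", "b"], ["c"]]})
encoded = ds.map(
    lambda ex: {"input_ids": [1] * len(ex["words"]), "labels": [0] * len(ex["words"])},
    remove_columns=["words"],
    features=toy_features,  # force the output schema instead of letting it be inferred
)
print(encoded.features)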
Example No. 13
def label_mapper(x):
    labels = ClassLabel(names=["neutral", "entails"])
    return {"label": labels.str2int(x)}
Example No. 14
def get_dataset(tokenizer,
                model_id=0,
                args=None,
                output_all_cols=False,
                data_dir=''):
    ds_path = os.path.join(data_dir,
                           f'task2/preprocessed_data/model{model_id}',
                           args.transformer)
    print(f'Dataset path: {ds_path}')
    try:
        encoded_ds = DatasetDict.load_from_disk(ds_path)
        print('Reloaded persisted dataset.')
    except:
        ds: DatasetDict = load_dataset("humicroedit", "subtask-2")
        glove, synset_sizes = None, None
        if model_id == 0:
            glove = torchtext.vocab.GloVe(name='840B',
                                          dim=300,
                                          cache=os.path.join(
                                              os.environ['HOME'],
                                              '.vector_cache'))
            synset_sizes = get_synsets_sizes(ds, task=2)

        for i in range(2):
            ds = ds.rename_column(f'edit{i+1}', f'word_fin{i+1}')
            ds = ds.map(
                get_preprocess_ds(glove=glove,
                                  synset_sizes=synset_sizes,
                                  idx=i + 1))
            ds = ds.remove_columns([f'original{i+1}'])
            ds = ds.rename_column(f'meanGrade{i+1}', f'grade{i+1}')

        if model_id == 2:
            ds = ds.map(add_T5_input)

        ds = ds.rename_column('label', 'labels')
        binary_ds = ds.filter(lambda ex: ex['labels'] != 0).\
            map(lambda ex: {'labels': ex['labels'] - 1})
        binary_ds_features = ds['train'].features.copy()
        binary_ds_features['labels'] = ClassLabel(
            names=ds['train'].features['labels'].names[1:])
        binary_ds = binary_ds.cast(binary_ds_features)

        encode_fn = get_encode(tokenizer, model_id=model_id)
        encoded_ds = binary_ds.map(encode_fn, batched=True, batch_size=100)

        print('Saving preprocessed dataset.')
        os.makedirs(ds_path)
        encoded_ds.save_to_disk(ds_path)

    if model_id == 0:
        from task1.data import get_encoded_ds_cols
        encoded_ds_cols = get_encoded_ds_cols(args)
        encoded_ds_cols = [
            f'{col}{i+1}' for i in range(2) for col in encoded_ds_cols
        ]
        encoded_ds_cols += ['grade1', 'grade2']
    elif model_id == 1 and args.transformer != 'distilbert-base-cased':
        encoded_ds_cols = ['input_ids', 'token_type_ids', 'attention_mask']
    else:
        encoded_ds_cols = ['input_ids', 'attention_mask']

    for _ds in encoded_ds.values():
        _ds.set_format(type='torch',
                       columns=encoded_ds_cols + ['labels'],
                       output_all_columns=output_all_cols)

    return encoded_ds
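The interesting ClassLabel step above is re-casting the label column after filtering out class 0, so that the remaining ids line up with a smaller set of names. A hedged, self-contained sketch of just that step (toy labels, illustration only):

from datasets import ClassLabel, Dataset, Features, Value

ds = Dataset.from_dict(
    {"text": ["a", "b", "c"], "labels": [0, 1, 2]},
    features=Features({
        "text": Value("string"),
        "labels": ClassLabel(names=["none", "first", "second"]),
    }),
)

# Drop class 0 and shift the remaining ids down by one...
binary = ds.filter(lambda ex: ex["labels"] != 0)
binary = binary.map(lambda ex: {"labels": ex["labels"] - 1})

# ...then cast to a ClassLabel that only keeps the surviving names.
new_features = ds.features.copy()
new_features["labels"] = ClassLabel(names=ds.features["labels"].names[1:])
binary = binary.cast(new_features)
print(binary.features["labels"].names)  # ['first', 'second']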
Example No. 15
bug_types = [
    bug.bug_type for bug in sstubs if bug.bug_type not in ignored_bug_types
]

train_data, test_data, train_labels, test_labels = train_test_split(
    all_data, all_labels, test_size=0.2, random_state=42, stratify=bug_types)

train_data = itertools.chain.from_iterable(train_data)
train_labels = itertools.chain.from_iterable(train_labels)
test_data = itertools.chain.from_iterable(test_data)
test_labels = itertools.chain.from_iterable(test_labels)

class_names = ['not_buggy', 'buggy']
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=class_names)
})

raw_train_dataset = Dataset.from_dict(
    {
        'text': train_data,
        'label': train_labels
    },
    features=features,
)
raw_val_dataset = Dataset.from_dict(
    {
        'text': test_data,
        'label': test_labels
    },
    features=features,
)
Example No. 16
def main():
    args = get_args()
    set_seed(args.seed)

    dataset = load_dataset("codeparrot/codecomplex", split="train")
    train_test = dataset.train_test_split(test_size=0.2)
    test_validation = train_test["test"].train_test_split(test_size=0.5)
    train_test_validation = DatasetDict({
        "train": train_test["train"],
        "test": test_validation["train"],
        "valid": test_validation["test"],
    })

    print("Loading tokenizer and model")
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt,
                                                               num_labels=7)
    model.config.pad_token_id = model.config.eos_token_id

    if args.freeze:
        for param in model.roberta.parameters():
            param.requires_grad = False

    labels = ClassLabel(num_classes=7,
                        names=list(
                            set(train_test_validation["train"]["complexity"])))

    def tokenize(example):
        inputs = tokenizer(example["src"], truncation=True, max_length=1024)
        label = labels.str2int(example["complexity"])
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "label": label,
        }

    tokenized_datasets = train_test_validation.map(
        tokenize,
        batched=True,
        remove_columns=train_test_validation["train"].column_names,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.num_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        weight_decay=0.01,
        metric_for_best_model="accuracy",
        run_name="complexity-java",
        report_to="wandb",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Training...")
    trainer.add_callback(CustomCallback(trainer))
    trainer.train()
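In the tokenize function above, labels.str2int receives a whole batch because map is called with batched=True; ClassLabel.str2int accepts either a single name or an iterable of names. A minimal sketch (the class names here are made up, not the real codecomplex labels):

from datasets import ClassLabel

complexity = ClassLabel(names=["constant", "linear", "quadratic"])
print(complexity.str2int("linear"))                   # 1
print(complexity.str2int(["constant", "quadratic"]))  # [0, 2]
print(complexity.int2str(2))                          # 'quadratic'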
Example No. 17
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        table = csv.read_csv("./data/train.csv",
                             parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_),
                                 names=class_label_.tolist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)

        # table = csv.read_csv(data_args.train_file)
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension,
                                data_files=data_files,
                                delimiter="\t",
                                quoting=csv_lib.QUOTE_NONE)
        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(
            num_classes=len(class_label_), names=class_label_.tolist()))

        train_dataset.features['ner_tags'] = class_label
        # train_ner_list: ChunkedArray = class_label.feature.str2int(train_dataset.data.column('label').to_numpy())
        # train_ner_array = pa.array(train_ner_list)
        # train_data = train_dataset.data.append_column("ner_tags", train_ner_array)
        train_dataset._data = train

        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test

        val_dataset.features['ner_tags'] = class_label
        # val_ner_list: ChunkedArray = class_label.feature.str2int(val_dataset.data.column('label').to_numpy())
        # val_ner_array = pa.array(val_ner_list)
        # val_data = val_dataset.data.append_column("ner_tags", val_ner_array)
        val_dataset._data = val

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    # label_list = ["O", "B-GENE", "I-GENE"]
    # label_to_id = {i: i for i in range(len(label_list))}
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]] if
                                         data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx

                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            # No label alignment is performed in this branch.
            print("unexpected examples format; skipping label alignment")

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
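The script above branches on whether the label column is already a Sequence of ClassLabel ids or plain strings that still need an index. A minimal sketch of that check with toy features (illustration only):

from datasets import ClassLabel, Features, Sequence, Value

features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=["O", "B-GENE", "I-GENE"])),
})

label_feature = features["ner_tags"]
if isinstance(label_feature.feature, ClassLabel):
    label_list = label_feature.feature.names  # labels are already ints
    label_to_id = {i: i for i in range(len(label_list))}
else:
    label_list = []                           # would need to scan the raw string labels
    label_to_id = {}
print(label_list)  # ['O', 'B-GENE', 'I-GENE']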
Example No. 18
def load_datasets(train_path=None, test_path=None, add_body=False, limit=None, preprocess=True):
    """
    Load and return datasets

    Returns
    -------

        train_dataset, dev_dataset, test_datasets: datasets.Dataset
    """
    test_path = test_path or _test_path
    train_path = train_path or _train_path

    with open(train_path) as f:
        train_articles = json.load(f)

    with open(test_path) as f:
        test_articles = json.load(f)


    train_comments = [serialize(article, comment, add_body) for article in train_articles for comment in article["comments"]]
    test_comments = [serialize(article, comment, add_body) for article in test_articles for comment in article["comments"]]

    if limit:
        train_comments = train_comments[:limit]
        test_comments = test_comments[:limit]
    train_df = pd.DataFrame(train_comments)
    test_df = pd.DataFrame(test_comments)

    train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=20212021)

    """
    Apply preprocessing: convert usernames to "usuario" and urls to URL
    """

    if preprocess:
        from pandarallel import pandarallel
        pandarallel.initialize()

        for df in [train_df, dev_df, test_df]:
            df["text"] = df["original_text"].parallel_apply(preprocess_tweet)
            df["article_text"] = df["article_text"].parallel_apply(preprocess_tweet)

    features = Features({
        'id': Value('uint64'),
        'title': Value('string'),
        'text': Value('string'),
        'article_text': Value('string'),
        'HATEFUL': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
    })

    if add_body:
        features["body"] = Value('string')


    for cat in extended_hate_categories:
        """
        Set for WOMEN, LGBTI...and also for CALLS
        """
        features[cat] = ClassLabel(num_classes=2, names=["NO", "YES"])

    columns = list(features.keys())

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
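Since Features subclasses dict, the schema above can be extended column by column before building the datasets, which is what the loop over extended_hate_categories does. A minimal sketch (the category names here are assumptions):

from datasets import ClassLabel, Features, Value

features = Features({"text": Value("string")})
features["HATEFUL"] = ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
for cat in ["WOMEN", "LGBTI", "CALLS"]:  # hypothetical category names
    features[cat] = ClassLabel(num_classes=2, names=["NO", "YES"])
print(list(features.keys()))  # ['text', 'HATEFUL', 'WOMEN', 'LGBTI', 'CALLS']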