def test_postprocess_xlnet_answer(qa_test_data, tmp_module):
    qa_processor = QAProcessor(model_name="xlnet-base-cased",
                               cache_dir=tmp_module)
    test_features = qa_processor.preprocess(
        qa_test_data["test_dataset"],
        is_training=False,
        max_question_length=16,
        max_seq_length=64,
        doc_stride=32,
        feature_cache_dir=tmp_module,
    )
    test_loader = dataloader_from_dataset(test_features, shuffle=False)
    qa_extractor = AnswerExtractor(model_name="xlnet-base-cased",
                                   cache_dir=tmp_module)
    predictions = qa_extractor.predict(test_loader)

    qa_processor.postprocess(
        results=predictions,
        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),
        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),
        output_prediction_file=os.path.join(tmp_module, "qa_predictions.json"),
        output_nbest_file=os.path.join(tmp_module, "nbest_predictions.json"),
        output_null_log_odds_file=os.path.join(tmp_module, "null_odds.json"),
    )

    qa_processor.postprocess(
        results=predictions,
        examples_file=os.path.join(tmp_module, CACHED_EXAMPLES_TEST_FILE),
        features_file=os.path.join(tmp_module, CACHED_FEATURES_TEST_FILE),
        unanswerable_exists=True,
        verbose_logging=True,
        output_prediction_file=os.path.join(tmp_module, "qa_predictions.json"),
        output_nbest_file=os.path.join(tmp_module, "nbest_predictions.json"),
        output_null_log_odds_file=os.path.join(tmp_module, "null_odds.json"),
    )
def test_classifier(data, tmpdir):

    df = pd.DataFrame({"text": data[0], "label": data[1]})
    num_labels = len(pd.unique(data[1]))
    model_name = "bert-base-uncased"
    processor = Processor(model_name=model_name, cache_dir=tmpdir)
    ds = processor.dataset_from_dataframe(df, "text", "label")
    dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True)
    classifier = SequenceClassifier(model_name=model_name,
                                    num_labels=num_labels,
                                    cache_dir=tmpdir)
    classifier.fit(train_dataloader=dl,
                   num_epochs=1,
                   num_gpus=0,
                   verbose=False)
    preds = classifier.predict(dl, num_gpus=0, verbose=False)
    assert len(preds) == len(data[1])
def test_token_classifier_fit_predict(tmpdir, ner_test_data):
    token_classifier = TokenClassifier(model_name="bert-base-uncased",
                                       num_labels=6,
                                       cache_dir=tmpdir)
    processor = TokenClassificationProcessor(model_name="bert-base-uncased",
                                             cache_dir=tmpdir)

    # test fit, no warmup
    train_dataset = processor.preprocess_for_bert(
        text=ner_test_data["INPUT_TEXT"],
        labels=ner_test_data["INPUT_LABELS"],
        label_map=ner_test_data["LABEL_MAP"],
    )
    train_dataloader = dataloader_from_dataset(train_dataset)
    token_classifier.fit(train_dataloader)

    # test predict, no labels
    _ = token_classifier.predict(train_dataloader, verbose=False)
def test_token_classifier_fit_predict(tmpdir, ner_test_data):
    num_labels = 6
    max_seq_len = MAX_SEQ_LEN
    token_classifier = TokenClassifier(model_name="bert-base-uncased",
                                       num_labels=num_labels,
                                       cache_dir=tmpdir)
    processor = TokenClassificationProcessor(model_name="bert-base-uncased",
                                             cache_dir=tmpdir)

    # test fit, no warmup
    train_dataset = processor.preprocess(
        text=ner_test_data["INPUT_TEXT"],
        max_len=max_seq_len,
        labels=ner_test_data["INPUT_LABELS"],
        label_map=ner_test_data["LABEL_MAP"],
    )
    train_dataloader = dataloader_from_dataset(train_dataset)
    token_classifier.fit(train_dataloader)

    # test predict, no labels
    preds = token_classifier.predict(train_dataloader, verbose=False)
    assert preds.shape == (len(train_dataloader), MAX_SEQ_LEN, num_labels)
def test_classifier_gpu_train_cpu_predict(data, tmpdir):

    df = pd.DataFrame({"text": data[0], "label": data[1]})
    num_labels = len(pd.unique(data[1]))
    model_name = "bert-base-uncased"
    processor = Processor(model_name=model_name, cache_dir=tmpdir)
    ds = processor.dataset_from_dataframe(df, "text", "label")
    dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True)
    classifier = SequenceClassifier(model_name=model_name,
                                    num_labels=num_labels,
                                    cache_dir=tmpdir)
    classifier.fit(train_dataloader=dl,
                   num_epochs=1,
                   num_gpus=1,
                   verbose=False)

    # gpu prediction, no model move
    preds = classifier.predict(dl, num_gpus=1, verbose=False)
    assert len(preds) == len(data[1])
    # cpu prediction, need model move
    assert next(classifier.model.parameters()).is_cuda is True
    preds = classifier.predict(dl, num_gpus=0, verbose=False)
    assert next(classifier.model.parameters()).is_cuda is False
def test_AnswerExtractor(qa_test_data, tmp_module):
    # bert
    qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
    train_loader_bert = dataloader_from_dataset(
        qa_test_data["train_features_bert"])
    test_loader_bert = dataloader_from_dataset(
        qa_test_data["test_features_bert"], shuffle=False)
    qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)

    # test saving fine-tuned model
    model_output_dir = os.path.join(tmp_module, "fine_tuned")
    assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
    assert os.path.exists(os.path.join(model_output_dir, "config.json"))

    qa_extractor_from_cache = AnswerExtractor(
        cache_dir=tmp_module, load_model_from_dir=model_output_dir)
    qa_extractor_from_cache.predict(test_loader_bert, verbose=False)

    # xlnet
    train_loader_xlnet = dataloader_from_dataset(
        qa_test_data["train_features_xlnet"])
    test_loader_xlnet = dataloader_from_dataset(
        qa_test_data["test_features_xlnet"], shuffle=False)
    qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased",
                                         cache_dir=tmp_module)
    qa_extractor_xlnet.fit(train_loader_xlnet,
                           verbose=False,
                           cache_model=False)
    qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)

    # distilbert
    train_loader_distilbert = dataloader_from_dataset(
        qa_test_data["train_features_distilbert"])
    test_loader_distilbert = dataloader_from_dataset(
        qa_test_data["test_features_distilbert"], shuffle=False)
    qa_extractor_distilbert = AnswerExtractor(
        model_name="distilbert-base-uncased", cache_dir=tmp_module)
    qa_extractor_distilbert.fit(train_loader_distilbert,
                                verbose=False,
                                cache_model=False)
    qa_extractor_distilbert.predict(test_loader_distilbert, verbose=False)
def load_dataset(
    local_path=TemporaryDirectory().name,
    test_fraction=0.3,
    random_seed=None,
    train_sample_ratio=1.0,
    test_sample_ratio=1.0,
    model_name="bert-base-uncased",
    to_lower=True,
    cache_dir=TemporaryDirectory().name,
    max_len=MAX_SEQ_LEN,
    trailing_piece_tag="X",
    batch_size=32,
    num_gpus=None,
):
    """
    Load the wikigold dataset and split into training and testing datasets.
    The datasets are preprocessed and can be used to train a NER model or evaluate
    on the testing dataset.

    Args:
        local_path (str, optional): The local file path to save the raw wikigold file.
            Defaults to TemporaryDirectory().name.
        test_fraction (float, optional): The fraction of testing dataset when splitting.
            Defaults to 0.3.
        random_seed (float, optional): Random seed used to shuffle the data.
            Defaults to None.
        train_sample_ratio (float, optional): The ratio used to sub-sample the training data.
            Defaults to 1.0.
        test_sample_ratio (float, optional): The ratio used to sub-sample the testing data.
            Defaults to 1.0.
        model_name (str, optional): The pretrained model name.
            Defaults to "bert-base-uncased".
        to_lower (bool, optional): Lower case text input.
            Defaults to True.
        cache_dir (str, optional): The default folder for saving cache files.
            Defaults to TemporaryDirectory().name.
        max_len (int, optional): Maximum length of the list of tokens. Lists longer
            than this are truncated and shorter ones are padded with "O"s.
            Defaults to MAX_SEQ_LEN.
        trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
            For example, "criticize" is broken into "critic" and "##ize", "critic"
            preserves its original label and "##ize" is labeled as trailing_piece_tag.
            Default value is "X".
        batch_size (int, optional): The batch size for training and testing.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs.
            Defaults to None.

    Returns:
        tuple. The tuple contains four elements.
        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
        label_map (dict): A dictionary object to map a label (str) to an ID (int).
        test_dataset (TensorDataset): A TensorDataset containing the following four tensors.
            1. input_ids_all: Tensor. Each sublist contains numerical values,
                i.e. token ids, corresponding to the tokens in the input text data.
            2. input_mask_all: Tensor. Each sublist contains the attention
                mask of the input token id list, 1 for input tokens and 0 for
                padded tokens, so that padded tokens are not attended to.
            3. trailing_token_mask_all: Tensor. Each sublist is
                a boolean list, True for the first word piece of each
                original word, False for the trailing word pieces,
                e.g. "##ize". This mask is useful for removing the
                predictions on trailing word pieces, so that each
                original word in the input text has a unique predicted
                label.
            4. label_ids_all: Tensor. Each sublist contains the token labels of
                an input sentence/paragraph. This tensor is only included if the
                `labels` argument is provided.
    """

    train_df, test_df = load_train_test_dfs(
        local_cache_path=local_path,
        test_fraction=test_fraction,
        random_seed=random_seed,
    )

    if train_sample_ratio > 1.0:
        train_sample_ratio = 1.0
        logging.warning("Setting the training sample ratio to 1.0")
    elif train_sample_ratio < 0:
        logging.error(
            "Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError(
            "Invalid training sample ratio: {}".format(train_sample_ratio))

    if test_sample_ratio > 1.0:
        test_sample_ratio = 1.0
        logging.warning("Setting the testing sample ratio to 1.0")
    elif test_sample_ratio < 0:
        logging.error(
            "Invalid testing sample ratio: {}".format(test_sample_ratio))
        raise ValueError(
            "Invalid testing sample ratio: {}".format(test_sample_ratio))

    if train_sample_ratio < 1.0:
        train_df = train_df.sample(frac=train_sample_ratio).reset_index(
            drop=True)
    if test_sample_ratio < 1.0:
        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)

    processor = TokenClassificationProcessor(model_name=model_name,
                                             to_lower=to_lower,
                                             cache_dir=cache_dir)

    label_map = TokenClassificationProcessor.create_label_map(
        label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag)
    train_dataset = processor.preprocess(
        text=train_df["sentence"],
        max_len=max_len,
        labels=train_df["labels"],
        label_map=label_map,
        trailing_piece_tag=trailing_piece_tag,
    )

    test_dataset = processor.preprocess(
        text=test_df["sentence"],
        max_len=max_len,
        labels=test_df["labels"],
        label_map=label_map,
        trailing_piece_tag=trailing_piece_tag,
    )

    train_dataloader = dataloader_from_dataset(
        train_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=True,
        distributed=False,
    )

    test_dataloader = dataloader_from_dataset(
        test_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=False,
        distributed=False,
    )

    return (train_dataloader, test_dataloader, label_map, test_dataset)
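# Usage sketch (an illustrative addition, not part of the original tests): one way the
# four-element tuple returned by load_dataset above might be consumed, reusing the
# TokenClassifier API exercised in the tests earlier in this file. The function name
# and the argument values passed here are hypothetical; it assumes load_dataset,
# TokenClassifier, and the dataloader utilities are importable as in those tests.
def example_wikigold_ner(tmpdir):
    train_dataloader, test_dataloader, label_map, test_dataset = load_dataset(
        local_path=tmpdir,
        cache_dir=tmpdir,
        test_fraction=0.3,
        batch_size=16,
        num_gpus=0,
    )
    token_classifier = TokenClassifier(
        model_name="bert-base-uncased",
        num_labels=len(label_map),
        cache_dir=tmpdir,
    )
    # fit and predict follow the same call pattern as the tests above
    token_classifier.fit(train_dataloader)
    preds = token_classifier.predict(test_dataloader, verbose=False)
    # test_dataset holds the input_ids, input_mask, trailing_token_mask and
    # label_ids tensors described in the docstring, in that order.
    return preds, test_dataset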
def load_tc_dataset(
    local_path=TemporaryDirectory().name,
    test_fraction=0.25,
    random_seed=None,
    train_sample_ratio=1.0,
    test_sample_ratio=1.0,
    model_name="bert-base-uncased",
    to_lower=True,
    cache_dir=TemporaryDirectory().name,
    max_len=MAX_SEQ_LEN,
    batch_size=32,
    num_gpus=None,
):
    """
    Load the DAC dataset and split into training and testing datasets.
    The datasets are preprocessed and can be used to train a sequence classification
    model or evaluate on the testing dataset.

    Args:
        local_path (str, optional): The local file path to save the raw dataset file.
            Defaults to TemporaryDirectory().name.
        test_fraction (float, optional): The fraction of testing dataset when splitting.
            Defaults to 0.25.
        random_seed (float, optional): Random seed used to shuffle the data.
            Defaults to None.
        train_sample_ratio (float, optional): The ratio used to sub-sample the training data.
            Defaults to 1.0.
        test_sample_ratio (float, optional): The ratio used to sub-sample the testing data.
            Defaults to 1.0.
        model_name (str, optional): The pretrained model name.
            Defaults to "bert-base-uncased".
        to_lower (bool, optional): Lower case text input.
            Defaults to True.
        cache_dir (str, optional): The default folder for saving cache files.
            Defaults to TemporaryDirectory().name.
        max_len (int, optional): Maximum length of the list of tokens. Lists longer
            than this are truncated and shorter ones are padded.
            Defaults to MAX_SEQ_LEN.
        batch_size (int, optional): The batch size for training and testing.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs.
            Defaults to None.

    Returns:
        tuple. The tuple contains four elements:
        train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
        test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
        label_encoder (LabelEncoder): a scikit-learn LabelEncoder instance. The raw
            label values can be retrieved by calling its `inverse_transform` function.
        test_labels (Series): a pandas Series of testing labels (in label ID format).
            If the labels are in raw label value format, transform them to label IDs
            with `label_encoder.transform` before evaluation.
    """

    # download and load the original dataset
    all_df = load_pandas_df(local_cache_path=local_path, num_rows=None)

    # set the text and label columns
    text_col = all_df.columns[0]
    label_col = all_df.columns[1]

    label_encoder = LabelEncoder()
    label_encoder.fit(["culture", "diverse", "economy", "politics", "sports"])

    # remove empty documents
    all_df = all_df[all_df[text_col].notna()]

    if test_fraction < 0 or test_fraction >= 1.0:
        logging.warning(
            "Invalid test fraction value: {}, changed to 0.25".format(
                test_fraction))
        test_fraction = 0.25

    train_df, test_df = train_test_split(all_df,
                                         train_size=(1.0 - test_fraction),
                                         random_state=random_seed)

    if train_sample_ratio > 1.0:
        train_sample_ratio = 1.0
        logging.warning("Setting the training sample ratio to 1.0")
    elif train_sample_ratio < 0:
        logging.error(
            "Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError(
            "Invalid training sample ratio: {}".format(train_sample_ratio))

    if test_sample_ratio > 1.0:
        test_sample_ratio = 1.0
        logging.warning("Setting the testing sample ratio to 1.0")
    elif test_sample_ratio < 0:
        logging.error(
            "Invalid testing sample ratio: {}".format(test_sample_ratio))
        raise ValueError(
            "Invalid testing sample ratio: {}".format(test_sample_ratio))

    if train_sample_ratio < 1.0:
        train_df = train_df.sample(frac=train_sample_ratio).reset_index(
            drop=True)
    if test_sample_ratio < 1.0:
        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)

    processor = Processor(model_name=model_name,
                          to_lower=to_lower,
                          cache_dir=cache_dir)

    train_dataset = processor.dataset_from_dataframe(
        df=train_df,
        text_col=text_col,
        label_col=label_col,
        max_len=max_len,
    )
    train_dataloader = dataloader_from_dataset(train_dataset,
                                               batch_size=batch_size,
                                               num_gpus=num_gpus,
                                               shuffle=True)

    test_dataset = processor.dataset_from_dataframe(
        df=test_df,
        text_col=text_col,
        label_col=label_col,
        max_len=max_len,
    )
    test_dataloader = dataloader_from_dataset(test_dataset,
                                              batch_size=batch_size,
                                              num_gpus=num_gpus,
                                              shuffle=False)

    # the DAC dataset already converted the labels to label ID format
    test_labels = test_df[label_col]
    return (train_dataloader, test_dataloader, label_encoder, test_labels)
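# Usage sketch (an illustrative addition, not part of the original module): one way the
# tuple returned by load_tc_dataset above might feed the SequenceClassifier exercised
# in the tests earlier in this file. The function name and the argument values passed
# here are hypothetical; it assumes load_tc_dataset and SequenceClassifier are
# importable as in those tests.
def example_tc_train_eval(tmpdir):
    train_dataloader, test_dataloader, label_encoder, test_labels = load_tc_dataset(
        local_path=tmpdir,
        cache_dir=tmpdir,
        batch_size=16,
        num_gpus=0,
    )
    classifier = SequenceClassifier(
        model_name="bert-base-uncased",
        num_labels=len(label_encoder.classes_),
        cache_dir=tmpdir,
    )
    classifier.fit(train_dataloader=train_dataloader,
                   num_epochs=1,
                   num_gpus=0,
                   verbose=False)
    preds = classifier.predict(test_dataloader, num_gpus=0, verbose=False)
    # preds and test_labels are both in label ID format; map predictions back to
    # raw label values with the fitted LabelEncoder if needed.
    pred_labels = label_encoder.inverse_transform(preds)
    return preds, pred_labels, test_labels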