Example #1
    def evaluate_model(self,
                       test_data,
                       labels,
                       max_seq_len=128,
                       batch_size=32):
        """
        Evaluate the trained model and return its accuracy on the test set.

        :param test_data: test data to evaluate the model on
        :type test_data: cudf.Series
        :param labels: labels for each element in test_data
        :type labels: cudf.Series
        :param max_seq_len: Limits the length of the sequence returned by the
            tokenizer. If the tokenized sentence is shorter than max_seq_len,
            the output is padded with 0s; if it is longer, it is truncated to
            max_seq_len.
        :type max_seq_len: int
        :param batch_size: batch size
        :type batch_size: int
        :return: accuracy of the model on the test data
        :rtype: float

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> sc.evaluate_model(emails_test, labels_test)
        """
        self._model.eval()
        test_gdf = cudf.DataFrame()
        test_gdf["text"] = test_data
        test_gdf["label"] = labels

        test_dataset = Dataset(test_gdf)
        test_dataloader = DataLoader(test_dataset, batchsize=batch_size)

        eval_accuracy = 0
        nb_eval_steps = 0
        for df in test_dataloader.get_chunks():
            b_input_ids, b_input_mask = self._bert_uncased_tokenize(
                df["text"], max_seq_len)
            b_labels = torch.tensor(df["label"].to_numpy())
            with torch.no_grad():
                logits = self._model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask)[0]

            logits = logits.type(torch.DoubleTensor).to(self._device)
            logits = cupy.fromDlpack(to_dlpack(logits))
            label_ids = b_labels.type(torch.IntTensor).to(self._device)
            label_ids = cupy.fromDlpack(to_dlpack(label_ids))
            temp_eval_accuracy = self._flatten_accuracy(logits, label_ids)

            eval_accuracy += temp_eval_accuracy
            nb_eval_steps += 1

        accuracy = eval_accuracy / nb_eval_steps

        return float(accuracy)
    def predict(self,
                input_data,
                max_seq_len=128,
                batch_size=32,
                threshold=0.5):
        """
        Predict the class of each input text element with the trained model.

        :param input_data: input text data for prediction
        :type input_data: cudf.Series
        :param max_seq_len: Limits the length of the sequence returned by the
            tokenizer. If the tokenized sentence is shorter than max_seq_len,
            the output is padded with 0s; if it is longer, it is truncated to
            max_seq_len.
        :type max_seq_len: int
        :param batch_size: batch size
        :type batch_size: int
        :param threshold: results with probabilities higher than this are
            labeled as positive (1)
        :type threshold: float
        :return: predictions (0/1 labels obtained by thresholding the
            probabilities) and the corresponding positive-class probabilities
        :rtype: cudf.Series, cudf.Series

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> sc.train_model(emails_train, labels_train)
        >>> predictions = sc.predict(emails_test, threshold=0.8)
        """

        predict_gdf = cudf.DataFrame()
        predict_gdf["text"] = input_data

        predict_dataset = Dataset(predict_gdf)
        predict_dataloader = DataLoader(predict_dataset, batchsize=batch_size)

        preds = cudf.Series()
        probs = cudf.Series()

        self._model.eval()
        for df in predict_dataloader.get_chunks():
            b_input_ids, b_input_mask = self._bert_uncased_tokenize(
                df["text"], max_seq_len)
            with torch.no_grad():
                logits = self._model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask)[0]
                b_probs = torch.sigmoid(logits[:, 1])
                b_preds = b_probs.ge(threshold)

            b_probs = cudf.io.from_dlpack(to_dlpack(b_probs))
            b_preds = cudf.io.from_dlpack(to_dlpack(b_preds))
            preds = preds.append(b_preds)
            probs = probs.append(b_probs)

        return preds, probs
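The _flatten_accuracy helper called by evaluate_model above is not part of this excerpt. Below is a minimal sketch of what such a per-batch flat accuracy typically computes, assuming the logits and label ids arrive as cupy arrays; it is an illustration, not necessarily the library's implementation.

import cupy


def flatten_accuracy_sketch(logits, label_ids):
    # Highest-scoring class per row, compared against the true labels.
    preds_flat = cupy.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    return float((preds_flat == labels_flat).sum() / labels_flat.size)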
Example #3
    def train_model(
        self,
        train_data,
        labels,
        learning_rate=3e-5,
        max_seq_len=128,
        batch_size=32,
        epochs=5,
    ):
        """
        Train the classifier.

        :param train_data: text data for training
        :type train_data: cudf.Series
        :param labels: labels for each element in train_data
        :type labels: cudf.Series
        :param learning_rate: learning rate
        :type learning_rate: float
        :param max_seq_len: Limits the length of the sequence returned by the
            tokenizer. If the tokenized sentence is shorter than max_seq_len,
            the output is padded with 0s; if it is longer, it is truncated to
            max_seq_len.
        :type max_seq_len: int
        :param batch_size: batch size
        :type batch_size: int
        :param epochs: number of training epochs, default is 5
        :type epochs: int

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> sc.train_model(emails_train, labels_train)
        """
        train_gdf = cudf.DataFrame()
        train_gdf["text"] = train_data
        train_gdf["label"] = labels

        train_dataset = Dataset(train_gdf)
        train_dataloader = DataLoader(train_dataset, batchsize=batch_size)

        self._config_optimizer(learning_rate)
        self._model.train()  # Enable training mode
        self._tokenizer = SubwordTokenizer(self._hashpath, do_lower_case=True)

        for _ in trange(epochs, desc="Epoch"):
            tr_loss = 0  # Tracking variables
            nb_tr_examples, nb_tr_steps = 0, 0
            for df in train_dataloader.get_chunks():
                b_input_ids, b_input_mask = self._bert_uncased_tokenize(
                    df["text"], max_seq_len)

                b_labels = torch.tensor(df["label"].to_numpy())
                self._optimizer.zero_grad()  # Clear out the gradients
                loss = self._model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]  # forward pass

                loss.sum().backward()
                self._optimizer.step()  # update parameters
                tr_loss += loss.sum().item()  # get a numeric value
                nb_tr_examples += b_input_ids.size(0)
                nb_tr_steps += 1

            print("Train loss: {}".format(tr_loss / nb_tr_steps))
Example #4
test_batchsize = 2
test_df = cudf.DataFrame({
    "domain": [
        "studytour.com.tw",
        "cnn.com",
        "bakercityherald.com",
        "bankmobile.com",
    ],
    "type": [1, 1, 0, 1],
})
expected_part_df1 = cudf.DataFrame({
    "domain": [
        "studytour.com.tw",
        "cnn.com",
    ],
    "type": [1, 1],
})

expected_part_df2 = cudf.DataFrame({
    "domain": [
        "bakercityherald.com",
        "bankmobile.com",
    ],
    "type": [0, 1],
})
dataset = Dataset(test_df)
dataloader = DataLoader(dataset, batchsize=test_batchsize)


def test_get_chunks():
    df_parts = []
    for df_part in dataloader.get_chunks():
        df_parts.append(df_part)
    assert len(df_parts) == 2
    assert df_parts[0].reset_index(drop=True).equals(expected_part_df1)
    assert df_parts[1].reset_index(drop=True).equals(expected_part_df2)
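For reference, the chunking behavior this test relies on can be sketched as a standalone generator that slices the DataFrame into batchsize-row pieces; this mirrors what get_chunks is expected to yield, not the DataLoader's actual implementation.


def get_chunks_sketch(df, batchsize):
    # Yield consecutive batchsize-row slices of the DataFrame, in order.
    for start in range(0, len(df), batchsize):
        yield df.iloc[start:start + batchsize]


With test_batchsize = 2 and the four-row test_df above, this yields exactly the two partitions asserted in test_get_chunks.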