예제 #1
0
    def evaluate_model(self,
                       emails,
                       labels,
                       max_num_sentences=1000000,
                       max_num_chars=100000000,
                       max_rows_tensor=1000000,
                       max_seq_len=128,
                       batch_size=32):
        """
        Evaluate trained BERT model and return its accuracy

        The emails are tokenized, fed in their original order through
        ``self._evaluate_testset`` and the arg-max of every output row is
        compared against ``labels`` using ``accuracy_score``.

        :param emails: dataframe where each row contains one column holding email text
        :type emails: cudf.DataFrame
        :param labels: series holding labels for each row in email dataframe
        :type labels: cudf.Series
        :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
        :type max_num_sentences: int
        :param max_num_chars: maximum number of characters passed to tokenizer
        :type max_num_chars: int
        :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
        :type max_rows_tensor: int
        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it will be truncated to max_seq_len.
        :type max_seq_len: int
        :param batch_size: number of rows handed to the model per DataLoader batch
        :type batch_size: int
        :return: the value computed by ``accuracy_score`` for true vs. predicted labels

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> phish_detect.evaluate_model(emails_test, labels_test)
        """
        tokenizer_options = dict(
            max_sequence_length=max_seq_len,
            max_num_sentences=max_num_sentences,
            max_num_chars=max_num_chars,
            max_rows_tensor=max_rows_tensor,
            do_truncate=True)
        # The third value returned by the tokenizer is not needed here.
        input_ids, attention_masks, _ = tokenizer.tokenize_df(
            emails, self._hashpath, **tokenizer_options)

        label_tensor = torch.tensor(labels.to_array())
        dataset = TensorDataset(input_ids, attention_masks, label_tensor)
        # Sequential sampling keeps rows in their input order.
        loader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=batch_size)

        batch_outputs, batch_labels = self._evaluate_testset(loader)

        # Collapse the per-batch outputs into one row per email, then pick
        # the index of the highest score in each row as the predicted class.
        output_rows = []
        for outputs in batch_outputs:
            output_rows.extend(outputs)
        predicted = np.argmax(output_rows, axis=1).flatten()

        expected = []
        for label_batch in batch_labels:
            expected.extend(label_batch)

        return accuracy_score(expected, predicted)
예제 #2
0
def test_tokenize_df(tmpdir, input_sentence):
    """Check that ``tokenizer.tokenize_df`` reproduces the expected tensors.

    The sentence is written to a temporary file, loaded back with
    ``cudf.read_csv`` and tokenized; tokens, masks and metadata must each
    equal the values from ``get_expected()``.

    The body only runs when CUDA is available; otherwise the test returns
    immediately without asserting anything.
    """
    if not torch.cuda.is_available():
        return

    want_tokens, want_masks, want_metadata = get_expected()

    sentence_file = tmpdir.mkdir("tmp_test_tokenizer").join("test1.txt")
    sentence_file.write(input_sentence)
    # Sanity-check the fixture file before handing it to cudf.
    assert sentence_file.read() == input_sentence

    frame = cudf.read_csv(sentence_file, header=None)
    got_tokens, got_masks, got_metadata = tokenizer.tokenize_df(frame)

    assert got_tokens.equal(want_tokens)
    assert got_masks.equal(want_masks)
    assert got_metadata.equal(want_metadata)
예제 #3
0
    def train_model(self,
                    emails,
                    labels,
                    max_num_sentences=1000000,
                    max_num_chars=100000000,
                    max_rows_tensor=1000000,
                    learning_rate=3e-5,
                    max_seq_len=128,
                    batch_size=32,
                    epochs=5):
        """
        Train the classifier

        The data is split 80%/20% into training and validation sets (with a
        fixed ``random_state`` of 2), both parts are tokenized, the optimizer
        is configured with ``learning_rate`` and ``self._model`` is replaced
        by the result of ``self._train``. The ``emails`` dataframe passed in
        by the caller is left unmodified.

        :param emails: dataframe where each row contains one column holding email text
        :type emails: cudf.DataFrame
        :param labels: series holding labels for each row in email dataframe
        :type labels: cudf.Series
        :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
        :type max_num_sentences: int
        :param max_num_chars: maximum number of characters passed to tokenizer
        :type max_num_chars: int
        :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
        :type max_rows_tensor: int
        :param learning_rate: learning rate
        :type learning_rate: float
        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it will be truncated to max_seq_len.
        :type max_seq_len: int
        :param batch_size: batch size
        :type batch_size: int
        :param epochs: number of epochs passed to the training loop, default is 5
        :type epochs: int

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> phish_detect.train_model(emails_train, labels_train)
        """
        # Attach the labels to a shallow copy rather than to the caller's
        # dataframe: assigning into `emails` directly would leave an extra
        # "label" column behind in the caller's object after training.
        labelled_emails = emails.copy(deep=False)
        labelled_emails["label"] = labels
        train_emails, validation_emails, train_labels, validation_labels = train_test_split(
            labelled_emails, 'label', train_size=0.8, random_state=2)

        # Tokenize training and validation with identical settings
        tokenizer_options = dict(
            max_sequence_length=max_seq_len,
            max_num_sentences=max_num_sentences,
            max_num_chars=max_num_chars,
            max_rows_tensor=max_rows_tensor,
            do_truncate=True)
        train_inputs, train_masks, _ = tokenizer.tokenize_df(
            train_emails, self._hashpath, **tokenizer_options)
        validation_inputs, validation_masks, _ = tokenizer.tokenize_df(
            validation_emails, self._hashpath, **tokenizer_options)

        # convert labels to tensors
        train_labels = torch.tensor(train_labels.to_array())
        validation_labels = torch.tensor(validation_labels.to_array())

        # create dataloaders: training batches are drawn in random order,
        # validation batches in their original order
        train_data = TensorDataset(train_inputs, train_masks, train_labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=batch_size)

        validation_data = TensorDataset(validation_inputs, validation_masks,
                                        validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(validation_data,
                                           sampler=validation_sampler,
                                           batch_size=batch_size)

        self._config_optimizer(learning_rate)

        self._model = self._train(train_dataloader, validation_dataloader,
                                  self._model, epochs)
예제 #4
0
    def predict(self,
                emails,
                max_num_sentences=1000000,
                max_num_chars=100000000,
                max_rows_tensor=1000000,
                max_seq_len=128,
                batch_size=32):
        """
        Predict the class of each email with the trained model

        The emails are tokenized and passed through ``self._model`` in their
        original order, in evaluation mode and without gradient tracking. The
        predicted class of a row is the index of its largest logit.

        :param emails: dataframe where each row contains one column holding email text
        :type emails: cudf.DataFrame
        :param max_num_sentences: maximum number of sentences to be encoded by tokenizer in one batch
        :type max_num_sentences: int
        :param max_num_chars: maximum number of characters passed to tokenizer
        :type max_num_chars: int
        :param max_rows_tensor: maximum number of rows in a tokenizer output tensor
        :type max_rows_tensor: int
        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it will be truncated to max_seq_len.
        :type max_seq_len: int
        :param batch_size: number of rows handed to the model per DataLoader batch
        :type batch_size: int
        :return: predictions: predicted labels (0 or 1) for each email
        :rtype: cudf.Series

        Examples
        --------
        >>> from cuml.preprocessing.model_selection import train_test_split
        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df, 'label', train_size=0.8)
        >>> phish_detect.train_model(emails_train, labels_train)
        >>> predictions = phish_detect.predict(new_emails_df)
        """
        tokenizer_options = dict(
            max_sequence_length=max_seq_len,
            max_num_sentences=max_num_sentences,
            max_num_chars=max_num_chars,
            max_rows_tensor=max_rows_tensor,
            do_truncate=True)
        # The third value returned by the tokenizer is not needed here.
        input_ids, attention_masks, _ = tokenizer.tokenize_df(
            emails, self._hashpath, **tokenizer_options)

        input_ids = input_ids.type(torch.LongTensor)
        dataset = TensorDataset(input_ids, attention_masks)
        # Sequential sampling keeps predictions aligned with the input rows.
        loader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=batch_size)

        self._model.eval()

        logit_rows = []
        for batch in loader:
            ids, mask = tuple(t.to(self._device) for t in batch)

            with torch.no_grad():
                # The first element of the model output holds the logits.
                batch_logits = self._model(ids,
                                           token_type_ids=None,
                                           attention_mask=mask)[0]

            # Accumulate one entry per email rather than one per batch.
            logit_rows.extend(batch_logits.detach().cpu().numpy())

        predicted = np.argmax(logit_rows, axis=1).flatten()
        return cudf.Series(predicted.tolist())