コード例 #1
0
 def test_fit_predict_multi_model(self):
     """
     Smoke-test a multi-label SequenceLabeler end to end.

     Fits on a 90% split, predicts on the held-out 10%, checks the nested
     structure returned by ``predict_proba``, then saves the model and
     predicts once more with the reloaded copy.
     """
     labeler_kwargs = {
         "batch_size": 2,
         "max_length": 256,
         "lm_loss_coef": 0.0,
         "multi_label_sequences": True,
     }
     self.model = SequenceLabeler(**labeler_kwargs)

     # Concatenate the pieces of every entry in self.texts into one string.
     joined_docs = ["".join(pieces) for pieces in self.texts]
     pad_label = self.model.config.pad_token
     docs, spans = finetune_to_indico_sequence(
         joined_docs, self.texts, self.labels, none_value=pad_label)
     fit_docs, held_out_docs, fit_spans, _ = train_test_split(
         docs, spans, test_size=0.1)

     self.model.fit(fit_docs, fit_spans)
     self.model.predict(held_out_docs)

     # predict_proba must return list -> list -> dict with a 'confidence' dict.
     probas = self.model.predict_proba(held_out_docs)
     self.assertIsInstance(probas, list)
     first_doc = probas[0]
     self.assertIsInstance(first_doc, list)
     first_span = first_doc[0]
     self.assertIsInstance(first_span, dict)
     self.assertIsInstance(first_span['confidence'], dict)

     # Round-trip through disk and make sure the reloaded model can predict.
     self.model.save(self.save_file)
     reloaded = SequenceLabeler.load(self.save_file)
     reloaded.predict(held_out_docs)
コード例 #2
0
ファイル: doc_rep.py プロジェクト: flippersmcgee/Enso
 def __init__(self, *args, **kwargs):
     """
     Build a RoBERTa-based SequenceLabeler that consumes auxiliary context.

     The inherited ``model_config`` is first overlaid with the settings
     below, then with the caller's ``kwargs``, so explicit keyword arguments
     always win.
     """
     super().__init__(*args, **kwargs)

     # General params that differ from finetune's defaults.
     general_overrides = {
         "base_model": RoBERTa,
         "batch_size": 4,
         "predict_batch_size": 10,
         "val_size": 0.0,
         "crf_sequence_labeling": False,
         "low_memory_mode": True,
         "class_weights": "log",
     }
     # Auxiliary-specific params.
     auxiliary_overrides = {
         "use_auxiliary_info": True,
         "context_dim": 4,
         "default_context": {
             'left': 0,
             'right': 0,
             'top': 0,
             'bottom': 0,
         },
         "n_context_embed_per_channel": 48,
         "context_in_base_model": True,
         "n_layers_with_aux": -1,
     }
     self.model_config.update({**general_overrides, **auxiliary_overrides})
     self.model_config.update(kwargs)
     self.model = SequenceLabeler(**self.model_config)
コード例 #3
0
    def setUp(self):
        """
        Seed the RNGs, load the processed corpus and build a fresh model.

        Reads a ``(texts, labels)`` pair from the JSON file at
        ``self.processed_path`` and constructs ``self.model`` from
        ``self.default_config()``.
        """
        self.save_file = 'tests/saved-models/test-save-load'

        # Fix both the stdlib and NumPy generators.
        seed = 42
        random.seed(seed)
        np.random.seed(seed)

        with open(self.processed_path, 'rt') as handle:
            corpus = json.load(handle)
        self.texts, self.labels = corpus

        config = self.default_config()
        self.model = SequenceLabeler(**config)
コード例 #4
0
ファイル: test_sequence.py プロジェクト: takuma-ynd/finetune
    def setUp(self):
        """
        Load the processed corpus and build a fresh, quiet model.

        Reads a ``(texts, labels)`` pair from the JSON file at
        ``self.processed_path``, resets TensorFlow's default graph and then
        constructs ``self.model``.
        """
        self.save_file = 'tests/saved-models/test-save-load'

        with open(self.processed_path, 'rt') as handle:
            corpus = json.load(handle)
        self.texts, self.labels = corpus

        # Reset the default graph before the new model is constructed.
        tf.reset_default_graph()

        labeler_kwargs = {"batch_size": 2, "max_length": 256, "verbose": False}
        self.model = SequenceLabeler(**labeler_kwargs)
コード例 #5
0
 def test_sequence_labeler_auxiliary(self):
     """
     Fit and predict with auxiliary context supplied.

     Trains for 1500 epochs on the training set, predicts on that same set
     and hands the predictions to ``_evaluate_sequence_preds`` with
     ``includes_context=True``.
     """
     # Here we want to make sure we're actually using context.
     config = self.default_config(n_epochs=1500)
     labeler = SequenceLabeler(**config)
     labeler.fit(self.trainX, self.trainY_seq, context=self.train_context)
     predicted = labeler.predict(self.trainX, context=self.train_context)
     self._evaluate_sequence_preds(predicted, includes_context=True)
コード例 #6
0
 def test_sequence_labeler_no_auxiliary(self):
     """
     Fit and predict with auxiliary info switched off.

     Predictions on the training set are handed to
     ``_evaluate_sequence_preds`` with ``includes_context=False``.
     """
     config = self.default_config(
         use_auxiliary_info=False,
         val_set=(self.trainX, self.trainY),
     )
     labeler = SequenceLabeler(**config)
     labeler.fit(self.trainX, self.trainY_seq)
     predicted = labeler.predict(self.trainX)
     self._evaluate_sequence_preds(predicted, includes_context=False)
コード例 #7
0
class FinetuneSequenceLabel(ClassificationExperiment):
    """Experiment that delegates fitting and prediction to a SequenceLabeler."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and create the labeler."""
        super().__init__(*args, **kwargs)
        labeler_kwargs = {"val_size": 0}
        self.model = SequenceLabeler(**labeler_kwargs)

    def fit(self, X, y):
        """Train the wrapped labeler on ``X`` with targets ``y``."""
        labeler = self.model
        labeler.fit(X, y)

    def predict(self, X, **kwargs):
        """Return the wrapped labeler's predictions for ``X``.

        Extra keyword arguments are accepted but not forwarded.
        """
        predictions = self.model.predict(X)
        return predictions
コード例 #8
0
ファイル: doc_rep.py プロジェクト: flippersmcgee/Enso
 def __init__(self, *args, **kwargs):
     """
     Build a SequenceLabeler with every auxiliary-context feature disabled.

     ``model_config`` is replaced by the defaults below, overlaid with the
     caller's ``kwargs``.
     """
     super().__init__(*args, **kwargs)
     self.model_config = {
         "use_auxiliary_info": False,
         "n_layers_with_aux": 0,
         "context_in_base_model": False,
         "context_dim": 0,
         **kwargs,
     }
     self.model = SequenceLabeler(**self.model_config)
コード例 #9
0
    def setUpClass(cls):
        """
        Prepare the datasets and trained models shared by the tests of this class.

        Each model is loaded from its saved path when possible; on
        ``FileNotFoundError`` it is trained from scratch and saved to that path.
        The comparison regressor is only prepared when ``cls.do_comparison``
        is truthy.
        """
        cls._download_data()
        
        # Dataset preparation: read at most n_sample * 10 rows of the classifier CSV.
        cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)

        # Sequence-labeling corpus stored as a (texts, labels) pair next to this file.
        path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
        with open(path, 'rt') as fp:
            cls.texts, cls.labels = json.load(fp)

        # Two word lists used further down to build similar/different pairs.
        cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
        cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
        
        # Train and save the sequence labeler for later use.
        # NOTE(review): default_seq_config is called on the class with cls passed
        # explicitly as its first argument -- presumably it is a plain method; confirm.
        try:
            cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
        except FileNotFoundError:
            cls.s = SequenceLabeler(**cls.default_seq_config(cls))
            cls.s.fit(cls.texts * 10, cls.labels * 10)
            cls.s.save(cls.sequence_labeler_path)
        
        # Train and save the classifier for later use.
        # The training sample is drawn even when the classifier is loaded from disk.
        train_sample = cls.classifier_dataset.sample(n=cls.n_sample*10)
        try:
            cls.cl = Classifier.load(cls.classifier_path)
        except FileNotFoundError:
            cls.cl = Classifier(**cls.default_config(cls))
            cls.cl.fit(train_sample.Text, train_sample.Target)
            cls.cl.save(cls.classifier_path)

        if cls.do_comparison:
            # Train and save the comparison regressor for later use.
            # NOTE(review): this instance is replaced by the try/except at the
            # end of this branch, so the assignment looks redundant -- confirm
            # the constructor has no needed side effect before removing it.
            cls.cr = ComparisonRegressor()
    
            n_per = 150
            similar = []
            different = []
            # "Similar" pairs: both words drawn from the same list
            # (n_per // 2 pairs per list).
            for dataset in [cls.animals, cls.numbers]:
                for i in range(n_per // 2):
                    similar.append([random.choice(dataset), random.choice(dataset)])
            # "Different" pairs: one animal paired with one number.
            for i in range(n_per):
                different.append([random.choice(cls.animals), random.choice(cls.numbers)])

            # Target 1 marks a similar pair, 0 a different pair; order matches `data`.
            targets = np.asarray([1] * len(similar) + [0] * len(different))
            data = similar + different

            # Fixed random_state makes the 70/30 split itself reproducible.
            cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(data, targets, test_size=0.3, random_state=42)
            
            try:
                cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
            except FileNotFoundError:
                cls.cr = ComparisonRegressor(**cls.default_config(cls))
                cls.cr.fit(cls.x_tr, cls.t_tr)
                cls.cr.save(cls.comparison_regressor_path)
コード例 #10
0
    def test_fit_predict(self):
        """
        Fit, predict and compare against a class-reweighted model.

        Checks that training runs, that ``predict_proba`` returns the expected
        nested structure, that every metric reports the "Named Entity" class,
        and that up-weighting "Named Entity" raises its token recall.
        """
        joined_docs = ["".join(pieces) for pieces in self.texts]
        pad_label = self.model.config.pad_token
        docs, spans = finetune_to_indico_sequence(
            joined_docs, self.texts, self.labels, none_value=pad_label
        )
        fit_docs, held_out_docs, fit_spans, held_out_spans = train_test_split(
            docs, spans, test_size=0.1
        )

        # Model with a heavy weight on the "Named Entity" class (trained first).
        reweighted_config = self.default_config(
            class_weights={"Named Entity": 100.0}
        )
        reweighted_model = SequenceLabeler(**reweighted_config)
        reweighted_model.fit(fit_docs, fit_spans)
        reweighted_token_recall = sequence_labeling_token_recall(
            held_out_spans, reweighted_model.predict(held_out_docs)
        )

        # Baseline model from setUp.
        self.model.fit(fit_docs, fit_spans)
        predictions = self.model.predict(held_out_docs)
        probas = self.model.predict_proba(held_out_docs)

        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span["confidence"], dict)

        # Compute all four metrics first, then check each reports the class.
        metric_fns = (
            sequence_labeling_token_precision,
            sequence_labeling_token_recall,
            sequence_labeling_overlap_precision,
            sequence_labeling_overlap_recall,
        )
        per_class_scores = [fn(held_out_spans, predictions) for fn in metric_fns]
        for scores in per_class_scores:
            self.assertIn("Named Entity", scores)
        token_recall = per_class_scores[1]

        self.model.save(self.save_file)

        self.assertGreater(
            reweighted_token_recall["Named Entity"], token_recall["Named Entity"]
        )
コード例 #11
0
ファイル: test_utils.py プロジェクト: seeker1943/finetune
    def test_train_test_tokenization_consistency(self):
        """
        Check that encoding with and without labels yields matching tokens.

        For every combination of multi-label setting and base model, the texts
        from ``data/testdata.csv`` are encoded once with their labels and once
        without; the first element of each paired token-id entry must agree.
        """
        csv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'data', 'testdata.csv'))
        frame = pd.read_csv(csv_path)

        X = []
        Y = []
        for _, row in frame.iterrows():
            X.append(row["text"])
            spans = json.loads(row["question_843"])
            for span in spans:
                # Copy the offsets to 'start'/'end' and attach the covered text.
                begin = span['startOffset']
                finish = span['endOffset']
                span.update(start=begin, end=finish,
                            text=row["text"][begin:finish])
            Y.append(spans)

        for multilabel_setting in (True, False):
            for base_model in (GPT, GPT2, BERT):
                model = SequenceLabeler(
                    chunk_long_sequences=True,
                    base_model=base_model,
                    multi_label_sequences=multilabel_setting)
                train_encoded = list(model.input_pipeline._text_to_ids(
                    X, Y=Y, pad_token=model.config.pad_token))
                test_encoded = list(model.input_pipeline._text_to_ids(X))
                for chunk_id, train_chunk in enumerate(train_encoded):
                    paired_ids = zip(train_chunk.token_ids,
                                     test_encoded[chunk_id].token_ids)
                    for train_token_ids, test_token_ids in paired_ids:
                        self.assertEqual(train_token_ids[0], test_token_ids[0])
コード例 #12
0
ファイル: test_sequence.py プロジェクト: kkleidal/finetune
 def test_fit_lm_only(self):
     """
     Fit on unlabeled text first, then on labeled data, and predict.

     Also checks the nested structure of ``predict_proba``, that every metric
     reports the 'Named Entity' class, and that a saved model can be
     reloaded and used for prediction.
     """
     joined_docs = ["".join(pieces) for pieces in self.texts]
     docs, spans = finetune_to_indico_sequence(
         joined_docs, self.texts, self.labels)
     fit_docs, held_out_docs, fit_spans, held_out_spans = train_test_split(
         docs, spans, test_size=0.1)

     # First call passes texts only (no targets); second call is supervised.
     self.model.fit(fit_docs)
     self.model.fit(fit_docs, fit_spans)

     predictions = self.model.predict(held_out_docs)
     probas = self.model.predict_proba(held_out_docs)
     self.assertIsInstance(probas, list)
     first_doc = probas[0]
     self.assertIsInstance(first_doc, list)
     first_span = first_doc[0]
     self.assertIsInstance(first_span, dict)
     self.assertIsInstance(first_span['confidence'], dict)

     # Compute all four metrics first, then check each reports the class.
     metric_fns = (
         sequence_labeling_token_precision,
         sequence_labeling_token_recall,
         sequence_labeling_overlap_precision,
         sequence_labeling_overlap_recall,
     )
     per_class_scores = [fn(held_out_spans, predictions) for fn in metric_fns]
     for scores in per_class_scores:
         self.assertIn('Named Entity', scores)

     # Round-trip through disk and predict with the reloaded model.
     self.model.save(self.save_file)
     reloaded = SequenceLabeler.load(self.save_file)
     predictions = reloaded.predict(held_out_docs)
コード例 #13
0
 def test_auxiliary_sequence_labeler(self):
     """
     Ensure model training does not error out
     Ensure model returns reasonable predictions

     The model is trained on the train split of ``self.dataset`` and scored
     on the test split; mean token precision and recall across classes must
     both exceed 0.6.
     """
     (trainX, testX, trainY, testY) = self.dataset
     model = SequenceLabeler(**self.default_config())
     model.fit(trainX, trainY)
     preds = model.predict(testX)

     # Ground truth goes first, predictions second; with the arguments
     # swapped the two metrics trade places.
     token_precision = sequence_labeling_token_precision(testY, preds)
     token_recall = sequence_labeling_token_recall(testY, preds)
     self.assertIn("Named Entity", token_precision)
     self.assertIn("Named Entity", token_recall)

     # Average over classes under new names so the per-class dicts stay intact.
     mean_precision = np.mean(list(token_precision.values()))
     mean_recall = np.mean(list(token_recall.values()))
     self.assertGreater(mean_precision, 0.6)
     self.assertGreater(mean_recall, 0.6)
コード例 #14
0
ファイル: doc_rep.py プロジェクト: flippersmcgee/Enso
class RoBERTaSeqLab(SidekickSeqLab):
    """Sequence-labeling experiment that ignores the context half of its inputs."""

    def __init__(self, *args, **kwargs):
        """
        Replace ``model_config`` with auxiliary-free defaults plus ``kwargs``.

        The labeler is then built from the resulting configuration.
        """
        super().__init__(*args, **kwargs)
        self.model_config = {
            "use_auxiliary_info": False,
            "n_layers_with_aux": 0,
            "context_in_base_model": False,
            "context_dim": 0,
            **kwargs,
        }
        self.model = SequenceLabeler(**self.model_config)

    def fit(self, X, y):
        """Train on the first element of every ``(text, context)`` pair in ``X``."""
        # Transpose the pairs; the context column is deliberately unused.
        documents, _unused_context = zip(*X)
        self.model.fit(documents, y)

    def predict(self, X, **kwargs):
        """Predict from the first element of every ``(text, context)`` pair in ``X``."""
        documents, _unused_context = zip(*X)
        predictions = self.model.predict(documents)
        return predictions
コード例 #15
0
ファイル: rationalized.py プロジェクト: fagan2888/Enso
class FinetuneSeqBaselineRationalized(ClassificationExperiment):
    """Classification experiment implemented on top of a SequenceLabeler."""

    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize the base experiment without auto-resampling and create the labeler."""
        super().__init__(*args, auto_resample=False, **kwargs)
        self.model = SequenceLabeler(val_size=0)

    def fit(self, X, y):
        """
        Turn each target into span annotations, resample, and train.

        Each ``y[i]`` is indexed as ``(spans, label)``. When ``spans`` is
        non-empty every span is copied with its "label" set to ``label``;
        otherwise one span covering the whole text is created.
        """
        targets = [
            [{**span, "label": target[1]} for span in target[0]]
            if target[0]
            else [{
                "start": 0,
                "end": len(doc),
                "label": target[1],
                "text": doc
            }]
            for doc, target in zip(X, y)
        ]

        # Resample indices rather than documents, keyed on the class label.
        positions = list(range(len(X)))
        class_labels = [target[1] for target in y]
        idxs, _ = self.resample(positions, class_labels)

        selected = [(X[i], targets[i]) for i in idxs]
        train_x = [doc for doc, _ in selected]
        train_y = [spans for _, spans in selected]
        self.model.fit(train_x, train_y)

    def predict(self, X, **kwargs):
        """
        Return a DataFrame with one column per class and one row per sample.

        Each cell is ``safe_mean`` of that class's confidence over the
        sample's predicted spans, plus 1e-10.
        """
        preds = self.model.predict_proba(X)

        # Work on a copy so the encoder's own class list is left untouched.
        classes = self.model.input_pipeline.label_encoder.classes_[:]
        classes.remove("<PAD>")

        records = [
            {
                k: safe_mean([s["confidence"][k] for s in sample]) + 1e-10
                for k in classes
            }
            for sample in preds
        ]
        return pd.DataFrame.from_records(records)

    def cleanup(self):
        """Drop the reference to the trained model."""
        del self.model
コード例 #16
0
ファイル: test_sequence.py プロジェクト: takuma-ynd/finetune
 def test_fit_predict(self):
     """
     Fit, predict, save, reload and predict again without errors.

     Uses ``train_test_split`` with its default split size.
     """
     joined_docs = ["".join(pieces) for pieces in self.texts]
     docs, spans = finetune_to_indico_sequence(
         joined_docs, self.texts, self.labels)
     split = train_test_split(docs, spans)
     fit_docs, held_out_docs, fit_spans, held_out_spans = split

     self.model.fit(fit_docs, fit_spans)
     predictions = self.model.predict(held_out_docs)

     # Round-trip through disk and predict with the reloaded model.
     self.model.save(self.save_file)
     reloaded = SequenceLabeler.load(self.save_file)
     predictions = reloaded.predict(held_out_docs)
コード例 #17
0
ファイル: doc_rep.py プロジェクト: flippersmcgee/Enso
    def __init__(self, *args, **kwargs):
        """
        Build a SequenceLabeler with ``pos_injection`` enabled.

        The inherited ``model_config`` is overlaid with the settings below and
        then with the caller's ``kwargs``.
        """
        super().__init__(*args, **kwargs)
        overrides = {
            "pos_injection": True,
            "n_layers_with_aux": 0,
            "context_in_base_model": False,
        }
        self.model_config.update(overrides)
        self.model_config.update(kwargs)
        self.model = SequenceLabeler(**self.model_config)
            


            
        
        
        
        
        
        
        
コード例 #18
0
ファイル: test_sequence.py プロジェクト: kkleidal/finetune
class TestSequenceLabeler(unittest.TestCase):
    """Integration tests for ``SequenceLabeler`` on the Reuters named-entity corpus."""

    n_sample = 100
    n_hidden = 768
    dataset_path = os.path.join('Data', 'Sequence', 'reuters.xml')
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Fetch the Reuters XML corpus if needed and cache it as JSON.

        The raw XML is downloaded to ``cls.dataset_path`` only when that file
        does not exist yet. It is then parsed and written to
        ``cls.processed_path`` as a ``(docs, docs_labels)`` pair of parallel
        lists: per document, one list of text chunks and one list of labels
        ("Named Entity" or "<PAD>").

        Raises:
            requests.HTTPError: if the download returns an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html5lib")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                if isinstance(c, Tag):
                    if c.name == "namedentityintext":
                        label = "Named Entity"  # part of a named entity
                    else:
                        label = "<PAD>"  # irrelevant word
                    texts.append(c.text)
                    labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Make sure the processed corpus exists before any test runs."""
        cls._download_reuters()

    def setUp(self):
        """Load the processed corpus, reset the TF graph and build a fresh model."""
        self.save_file = 'tests/saved-models/test-save-load'

        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        tf.reset_default_graph()

        self.model = SequenceLabeler(batch_size=2,
                                     max_length=256,
                                     lm_loss_coef=0.0,
                                     verbose=False)

    def _split_corpus(self):
        """Convert the corpus to raw text plus annotations and split it 90/10."""
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs, self.texts, self.labels)
        return train_test_split(texts, annotations, test_size=0.1)

    def _check_predictions_and_reload(self, test_texts, test_annotations):
        """Check prediction structure and metrics, then save, reload and predict."""
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)
        token_precision = sequence_labeling_token_precision(
            test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(test_annotations,
                                                      predictions)
        overlap_precision = sequence_labeling_overlap_precision(
            test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(
            test_annotations, predictions)
        self.assertIn('Named Entity', token_precision)
        self.assertIn('Named Entity', token_recall)
        self.assertIn('Named Entity', overlap_precision)
        self.assertIn('Named Entity', overlap_recall)
        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        model.predict(test_texts)

    def _assert_mismatched_span_rejected(self):
        """Fitting must raise ValueError when span text and character idxs disagree."""
        with self.assertRaises(ValueError):
            self.model.fit(["Text about a dog."], [[{
                "start": 0,
                "end": 5,
                "text": "cat",
                "label": "dog"
            }]])

    @staticmethod
    def _load_testdata():
        """Return the contents of ``testdata.json`` located next to this file."""
        path = os.path.join(os.path.dirname(__file__), "testdata.json")
        with open(path, "rt") as fp:
            return json.load(fp)

    def test_fit_lm_only(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions

        The model is first fit on texts alone, then on texts with annotations.
        """
        train_texts, test_texts, train_annotations, test_annotations = (
            self._split_corpus())
        self.model.fit(train_texts)
        self.model.fit(train_texts, train_annotations)
        self._check_predictions_and_reload(test_texts, test_annotations)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        """
        train_texts, test_texts, train_annotations, test_annotations = (
            self._split_corpus())
        self.model.fit(train_texts, train_annotations)
        self._check_predictions_and_reload(test_texts, test_annotations)

    def test_reasonable_predictions(self):
        """
        Ensure a model finetuned on the small test corpus labels "dog".

        Checked with the default configuration and again with
        ``subtoken_predictions`` enabled.
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!"
        ]

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_span_rejected()

        text, labels = self._load_testdata()
        self.model.finetune(text * 10, labels * 10)

        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self.assertTrue(any(pred["text"] == "dog" for pred in predictions[0]))

        self.model.config.subtoken_predictions = True
        predictions = self.model.predict(test_sequence)
        self.assertTrue(1 <= len(predictions[0]) <= 3)
        self.assertTrue(any(pred["text"] == "dog" for pred in predictions[0]))

    def test_chunk_long_sequences(self):
        """
        Ensure long inputs are handled when ``chunk_long_sequences`` is on.

        With ``max_length`` set to 18, the sentence repeated ten times must
        yield exactly 20 predicted spans, at least one with text "dog".
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!"
            * 10
        ]

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 18
        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        self._assert_mismatched_span_rejected()

        text, labels = self._load_testdata()
        self.model.finetune(text * 10, labels * 10)

        predictions = self.model.predict(test_sequence)
        # Report the offending predictions only when the check fails.
        self.assertEqual(
            len(predictions[0]), 20,
            msg="unexpected predictions for {!r}: {!r}".format(
                test_sequence, predictions))
        self.assertTrue(any(pred["text"] == "dog" for pred in predictions[0]))
コード例 #19
0
 def __init__(self, *args, **kwargs):
     """Set up the base experiment with auto-resampling off and create the labeler."""
     super().__init__(*args, auto_resample=False, **kwargs)
     labeler_kwargs = {"val_size": 0}
     self.model = SequenceLabeler(**labeler_kwargs)
コード例 #20
0
        os.remove(XML_PATH)

        raw_texts = ["".join(doc) for doc in docs]
        texts, annotations = finetune_to_indico_sequence(
            raw_texts, docs, docs_labels)
        df = pd.DataFrame({
            'texts':
            texts,
            'annotations':
            [json.dumps(annotation) for annotation in annotations]
        })
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Demo: train on the first 1000 rows and print a few predictions.
    dataset = Reuters(nrows=1000).dataframe
    # Annotations are decoded from their JSON-string form before use.
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    trainX, testX, trainY, testY = train_test_split(
        dataset.texts.values,
        dataset.annotations.values,
        test_size=0.3,
        random_state=42,
    )

    model = SequenceLabeler(
        verbose=False,
        max_length=64,
        chunk_long_sequences=True,
    )
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Show the first few held-out texts next to their predictions.
    n_sample = 10
    for i in range(n_sample):
        print(testX[i], predictions[i])
コード例 #21
0
class TestSequenceLabeler(unittest.TestCase):

    n_sample = 100
    dataset_path = os.path.join('Data', 'Sequence', 'reuters.xml')
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters named-entity XML and cache a processed JSON copy.

        The raw XML is fetched to ``cls.dataset_path`` only when that file is
        missing. It is then parsed into two parallel lists (one entry per
        document) of text chunks and their labels, which are written as JSON
        to ``cls.processed_path``.

        Raises:
            requests.HTTPError: if the download responds with an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            response = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            response.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(response.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                # Children that are not tags are ignored.
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Run the class-level Reuters download / preprocessing step."""
        cls._download_reuters()

    def default_config(self, **kwargs):
        """
        Return the baseline model settings used by these tests.

        Keyword arguments override the defaults of the same name and add any
        new keys after them.
        """
        return {
            "batch_size": 2,
            "max_length": 256,
            "lm_loss_coef": 0.0,
            "val_size": 0,
            "interpolate_pos_embed": False,
            **kwargs,
        }

    def setUp(self):
        """Seed the RNGs, load the processed corpus and build a fresh model."""
        self.save_file = 'tests/saved-models/test-save-load'
        for seed_fn in (random.seed, np.random.seed):
            seed_fn(42)
        with open(self.processed_path, 'rt') as handle:
            corpus = json.load(handle)
        # The JSON file holds a (texts, labels) pair.
        self.texts, self.labels = corpus

        self.model = SequenceLabeler(**self.default_config())

    @pytest.mark.skipif(
        SKIP_LM_TESTS,
        reason="Bidirectional models do not yet support LM functions")
    def test_fit_lm_only(self):
        """
        Fit on texts alone, then on texts with annotations
        Check the prediction structure and the per-class metrics
        Check the model can be saved, reloaded and used to predict
        """
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, test_annotations = train_test_split(
            texts, annotations, test_size=0.1)

        # First fit receives no targets; the second is the supervised fit.
        self.model.fit(train_texts)
        self.model.fit(train_texts, train_annotations)

        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span['confidence'], dict)

        # Compute every metric before asserting on any of them.
        metric_fns = (
            sequence_labeling_token_precision,
            sequence_labeling_token_recall,
            sequence_labeling_overlap_precision,
            sequence_labeling_overlap_recall,
        )
        per_class_scores = [
            metric(test_annotations, predictions) for metric in metric_fns
        ]
        for scores in per_class_scores:
            self.assertIn('Named Entity', scores)

        self.model.save(self.save_file)
        reloaded = SequenceLabeler.load(self.save_file)
        reloaded.predict(test_texts)

    def test_fit_predict(self):
        """
        Train a class-reweighted model and the default model on the same split
        Check the prediction structure and the per-class metrics
        Check the reweighted model has higher 'Named Entity' token recall
        """
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, test_annotations = train_test_split(
            texts, annotations, test_size=0.1, random_state=42)

        # Model with the 'Named Entity' class weighted 10x.
        weighted_config = self.default_config(
            class_weights={'Named Entity': 10.})
        reweighted_model = SequenceLabeler(**weighted_config)
        reweighted_model.fit(train_texts, train_annotations)
        reweighted_token_recall = sequence_labeling_token_recall(
            test_annotations, reweighted_model.predict(test_texts))

        # Baseline model created in setUp.
        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)

        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span['confidence'], dict)

        token_precision = sequence_labeling_token_precision(
            test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(
            test_annotations, predictions)
        overlap_precision = sequence_labeling_overlap_precision(
            test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(
            test_annotations, predictions)

        for scores in (token_precision, token_recall,
                       overlap_precision, overlap_recall):
            self.assertIn('Named Entity', scores)

        self.model.save(self.save_file)

        baseline_recall = token_recall['Named Entity']
        self.assertGreater(reweighted_token_recall['Named Entity'],
                           baseline_recall)

    def test_cached_predict(self):
        """
        Ensure predictions made inside cached_predict() match uncached ones
        Ensure the second cached prediction takes less than half the time
        of the first
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1)
        self.model.fit(train_texts, train_annotations)

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 128

        uncached_preds = self.model.predict(test_texts[:1])

        with self.model.cached_predict():
            # perf_counter is monotonic, so the measured intervals cannot be
            # distorted by wall-clock adjustments.
            start = time.perf_counter()
            self.model.predict(test_texts[:1])
            first = time.perf_counter()
            self.model.predict(test_texts[:1])
            second = time.perf_counter()
            preds = self.model.predict(test_texts[:1])
            # unittest assertions still run under ``python -O``, unlike bare asserts.
            self.assertEqual(len(preds), 1)
            preds = self.model.predict(test_texts[:2])
            self.assertEqual(len(preds), 2)

        # zip stops at the shorter sequence, so only the first document is compared.
        for uncached_pred, cached_pred in zip(uncached_preds, preds):
            self.assertEqual(str(uncached_pred), str(cached_pred))

        first_prediction_time = (first - start)
        second_prediction_time = (second - first)
        self.assertLess(second_prediction_time, first_prediction_time / 2.)

    def test_reasonable_predictions(self):
        """
        Ensure mismatched annotation text is rejected with ValueError
        Ensure a model fit on the fixture data labels "dog" in a new sentence,
        and that a second predict call gives results passing the same checks
        """
        test_sequence = [
            "I am a dog. A dog that's incredibly bright. I can talk, read, and write!"
        ]
        path = os.path.join(os.path.dirname(__file__), "testdata.json")

        # The annotation text ("cat") does not match the characters at 0..5,
        # so fit is expected to raise ValueError.
        mismatched_annotation = {
            "start": 0,
            "end": 5,
            "text": "cat",
            "label": "dog"
        }
        with self.assertRaises(ValueError):
            self.model.fit(["Text about a dog."], [[mismatched_annotation]])

        with open(path, "rt") as handle:
            text, labels = json.load(handle)

        self.model.fit(text * 10, labels * 10)

        # Predict twice and apply identical checks to each result.
        for _ in range(2):
            predictions = self.model.predict(test_sequence)
            self.assertTrue(1 <= len(predictions[0]) <= 3)
            self.assertTrue(
                any(pred["text"].strip() == "dog" for pred in predictions[0]))

    def test_chunk_long_sequences(self):
        """
        Ensure mismatched annotation text is rejected with ValueError
        Ensure 20 spans are predicted for a sentence repeated 10 times when
        chunk_long_sequences is on and max_length is 18
        """
        sentence = "I am a dog. A dog that's incredibly bright. I can talk, read, and write! "
        test_sequence = [sentence * 10]
        path = os.path.join(os.path.dirname(__file__), "testdata.json")

        self.model.config.chunk_long_sequences = True
        self.model.config.max_length = 18

        # test ValueError raised when raw text is passed along with character idxs and doesn't match
        mismatched_annotation = {
            "start": 0,
            "end": 5,
            "text": "cat",
            "label": "dog"
        }
        with self.assertRaises(ValueError):
            self.model.fit(["Text about a dog."], [[mismatched_annotation]])

        with open(path, "rt") as handle:
            text, labels = json.load(handle)

        self.model.finetune(text * 10, labels * 10)

        predictions = self.model.predict(test_sequence)
        first_doc_preds = predictions[0]
        self.assertEqual(len(first_doc_preds), 20)
        self.assertTrue(
            any(pred["text"].strip() == "dog" for pred in first_doc_preds))

    def test_fit_predict_multi_model(self):
        """
        Train a model built with multi_label_sequences=True
        Check the prediction structure
        Check the model can be saved, reloaded and used to predict
        """
        self.model = SequenceLabeler(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            multi_label_sequences=True,
        )
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1)

        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)

        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span['confidence'], dict)

        self.model.save(self.save_file)
        reloaded = SequenceLabeler.load(self.save_file)
        reloaded.predict(test_texts)
コード例 #22
0
ファイル: reuters.py プロジェクト: bin2000/finetune
            for c in elem.find("textwithnamedentities").children:
                if type(c) == Tag:
                    if c.name == "namedentityintext":
                        label = "Named Entity"  # part of a named entity
                    else:
                        label = "<PAD>"  # irrelevant word
                    texts.append(c.text)
                    labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        fd.close()
        os.remove(XML_PATH)

        raw_texts = ["".join(doc) for doc in docs]
        texts, annotations = finetune_to_indico_sequence(raw_texts, docs, docs_labels)
        df = pd.DataFrame({'texts': texts, 'annotations': [json.dumps(annotation) for annotation in annotations]})
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Build the dataset and decode the JSON-encoded annotation column.
    dataset = Reuters(nrows=1000).dataframe
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    # 70 / 30 train / test split with a fixed seed.
    trainX, testX, trainY, testY = train_test_split(
        dataset.texts.values,
        dataset.annotations.values,
        test_size=0.3,
        random_state=42,
    )

    model = SequenceLabeler(
        verbose=False,
        max_length=64,
        chunk_long_sequences=True,
    )
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Print the first few test documents next to their predictions.
    n_sample = 10
    for i in range(n_sample):
        print(testX[i], predictions[i])
コード例 #23
0
 def __init__(self, *args, **kwargs):
     """Forward all arguments to the parent and attach a SequenceLabeler built with val_size=0."""
     super().__init__(*args, **kwargs)
     labeler = SequenceLabeler(val_size=0)
     self.model = labeler
コード例 #24
0
            none_value="<PAD>",
            subtoken_predictions=True)
        df = pd.DataFrame({
            'texts':
            texts,
            'annotations':
            [json.dumps(annotation) for annotation in annotations]
        })
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Build the dataset and decode the JSON-encoded annotation column.
    dataset = Reuters().dataframe
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    # 30 / 70 train / test split with a fixed seed.
    trainX, testX, trainY, testY = train_test_split(
        dataset.texts.values,
        dataset.annotations.values,
        test_size=0.7,
        random_state=42,
    )

    model = SequenceLabeler(
        base_model=RoBERTa,
        batch_size=1,
        val_size=0.,
        max_length=16,
        chunk_long_sequences=True,
        subtoken_predictions=True,
    )
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Raw predictions first, then the report comparing them with testY.
    print(predictions)
    print(annotation_report(testY, predictions))
コード例 #25
0
class TestSequenceLabelerTextCNN(TestModelBase):
    n_sample = 100
    dataset_path = os.path.join("Data", "Sequence", "reuters.xml")
    processed_path = os.path.join("Data", "Sequence", "reuters.json")

    base_model = TextCNN

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters named-entity XML and cache a processed JSON copy.

        The raw XML is fetched to ``cls.dataset_path`` only when that file is
        missing. It is then parsed into two parallel lists (one entry per
        document) of text chunks and their labels, which are written as JSON
        to ``cls.processed_path``.

        Raises:
            requests.HTTPError: if the download responds with an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            response = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            response.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(response.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                # Children that are not tags are ignored.
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, "wt") as fp:
            json.dump((docs, docs_labels), fp)

    @classmethod
    def setUpClass(cls):
        """Run the class-level Reuters download / preprocessing step."""
        cls._download_reuters()

    def setUp(self):
        """Seed the RNGs, load the processed corpus and build a fresh model."""
        self.save_file = "tests/saved-models/test-save-load"
        for seed_fn in (random.seed, np.random.seed):
            seed_fn(42)
        with open(self.processed_path, "rt") as handle:
            corpus = json.load(handle)
        # The JSON file holds a (texts, labels) pair.
        self.texts, self.labels = corpus

        self.model = SequenceLabeler(**self.default_config())

    def test_fit_predict(self):
        """
        Train the model and predict on a held-out split
        Check the prediction structure and the per-class metrics
        Check the model can be saved
        """
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, test_annotations = train_test_split(
            texts, annotations, test_size=0.1)

        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)

        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span["confidence"], dict)

        # Compute every metric before asserting on any of them.
        metric_fns = (
            sequence_labeling_token_precision,
            sequence_labeling_token_recall,
            sequence_labeling_overlap_precision,
            sequence_labeling_overlap_recall,
        )
        per_class_scores = [
            metric(test_annotations, predictions) for metric in metric_fns
        ]
        for scores in per_class_scores:
            self.assertIn("Named Entity", scores)

        self.model.save(self.save_file)

    def test_cached_predict(self):
        """
        Train the model, then predict twice inside cached_predict()
        Ensure neither call errors out
        """
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1)

        self.model.fit(train_texts, train_annotations)

        with self.model.cached_predict():
            for _ in range(2):
                self.model.predict(test_texts)

    def test_fit_predict_multi_model(self):
        """
        Train a model built with multi_label_sequences=True
        Check the prediction structure
        Check the model can be saved, reloaded and used to predict
        """
        multi_label_config = self.default_config(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            multi_label_sequences=True,
        )
        self.model = SequenceLabeler(**multi_label_config)
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            none_value=self.model.config.pad_token)
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1)

        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)

        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span["confidence"], dict)

        self.model.save(self.save_file)
        reloaded = SequenceLabeler.load(self.save_file)
        reloaded.predict(test_texts)
コード例 #26
0
        fd.close()
        os.remove(XML_PATH)

        raw_texts = ["".join(doc) for doc in docs]
        texts, annotations = finetune_to_indico_sequence(
            raw_texts, docs, docs_labels)
        df = pd.DataFrame({
            'texts':
            texts,
            'annotations':
            [json.dumps(annotation) for annotation in annotations]
        })
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Build the dataset and decode the JSON-encoded annotation column.
    dataset = Reuters().dataframe
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    # 70 / 30 train / test split with a fixed seed.
    trainX, testX, trainY, testY = train_test_split(
        dataset.texts.values,
        dataset.annotations.values,
        test_size=0.3,
        random_state=42,
    )

    model = SequenceLabeler(
        batch_size=2,
        val_size=0.,
        chunk_long_sequences=True,
    )
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Report comparing the predictions with testY.
    print(annotation_report(testY, predictions))
コード例 #27
0
ファイル: reuters.py プロジェクト: wavelets/finetune
        fd.close()
        os.remove(XML_PATH)

        raw_texts = ["".join(doc) for doc in docs]
        texts, annotations = finetune_to_indico_sequence(
            raw_texts, docs, docs_labels)
        df = pd.DataFrame({
            'texts':
            texts,
            'annotations':
            [json.dumps(annotation) for annotation in annotations]
        })
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Build the dataset and decode the JSON-encoded annotation column.
    dataset = Reuters(nrows=1000).dataframe
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    # 70 / 30 train / test split with a fixed seed; the columns are passed
    # as-is rather than as ``.values`` arrays.
    trainX, testX, trainY, testY = train_test_split(
        dataset.texts,
        dataset.annotations,
        test_size=0.3,
        random_state=42,
    )

    model = SequenceLabeler(verbose=False)
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Print the first few test documents next to their predictions;
    # testX is indexed positionally through ``.values``.
    n_sample = 10
    for i in range(n_sample):
        print(testX.values[i], predictions[i])
コード例 #28
0
 def __init__(self, *args, **kwargs):
     """Build the labeler from ``val_size=0`` overlaid with the caller's keyword arguments."""
     super().__init__(*args, **kwargs)
     # Caller-supplied keywords take precedence over the default.
     self.model_config = {"val_size": 0, **kwargs}
     self.model = SequenceLabeler(**self.model_config)
コード例 #29
0
ファイル: test_textcnn.py プロジェクト: RossSong/finetune
class TestSequenceLabeler(unittest.TestCase):
    n_sample = 100
    dataset_path = os.path.join(
        'Data', 'Sequence', 'reuters.xml'
    )
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters named-entity XML and cache a processed JSON copy.

        The raw XML is fetched to ``cls.dataset_path`` only when that file is
        missing. It is then parsed into two parallel lists (one entry per
        document) of text chunks and their labels, which are written as JSON
        to ``cls.processed_path``.

        Raises:
            requests.HTTPError: if the download responds with an error status.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            response = requests.get(url)
            # Fail loudly instead of caching an HTTP error page as the dataset.
            response.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(response.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html.parser")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                # Children that are not tags are ignored.
                if not isinstance(c, Tag):
                    continue
                if c.name == "namedentityintext":
                    label = "Named Entity"  # part of a named entity
                else:
                    label = "<PAD>"  # irrelevant word
                texts.append(c.text)
                labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)


    @classmethod
    def setUpClass(cls):
        """Run the class-level Reuters download / preprocessing step."""
        cls._download_reuters()

    def default_config(self, **kwargs):
        """
        Return the baseline model settings used by these tests.

        Keyword arguments override the defaults of the same name and add any
        new keys after them.
        """
        return {
            "batch_size": 2,
            "max_length": 256,
            "lm_loss_coef": 0.0,
            "val_size": 0,
            "interpolate_pos_embed": False,
            **kwargs,
        }

    def setUp(self):
        """Seed the RNGs, load the processed corpus and build a fresh model."""
        self.save_file = 'tests/saved-models/test-save-load'
        random.seed(42)
        np.random.seed(42)
        with open(self.processed_path, 'rt') as fp:
            self.texts, self.labels = json.load(fp)

        # default_config is a method of this class, so it has to be reached
        # through self; the bare name raised NameError.
        self.model = SequenceLabeler(
            **self.default_config()
        )

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions
        Ensure class reweighting behaves as intended
        """
        raw_docs = ["".join(text) for text in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            raw_docs,
            self.texts,
            self.labels,
            encoder=self.model.input_pipeline.text_encoder,
            none_value=self.model.config.pad_token
        )
        train_texts, test_texts, train_annotations, test_annotations = train_test_split(
            texts, annotations, test_size=0.1
        )

        # default_config is a method of this class, so it has to be reached
        # through self; the bare name raised NameError.
        reweighted_model = SequenceLabeler(
            **self.default_config(class_weights={'Named Entity': 10.})
        )
        reweighted_model.fit(train_texts, train_annotations)
        reweighted_predictions = reweighted_model.predict(test_texts)
        reweighted_token_recall = sequence_labeling_token_recall(test_annotations, reweighted_predictions)

        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        probas = self.model.predict_proba(test_texts)

        self.assertIsInstance(probas, list)
        self.assertIsInstance(probas[0], list)
        self.assertIsInstance(probas[0][0], dict)
        self.assertIsInstance(probas[0][0]['confidence'], dict)

        token_precision = sequence_labeling_token_precision(test_annotations, predictions)
        token_recall = sequence_labeling_token_recall(test_annotations, predictions)
        overlap_precision = sequence_labeling_overlap_precision(test_annotations, predictions)
        overlap_recall = sequence_labeling_overlap_recall(test_annotations, predictions)

        self.assertIn('Named Entity', token_precision)
        self.assertIn('Named Entity', token_recall)
        self.assertIn('Named Entity', overlap_precision)
        self.assertIn('Named Entity', overlap_recall)

        self.model.save(self.save_file)

        # The model weighted 10x on 'Named Entity' must recall more of that class.
        self.assertGreater(reweighted_token_recall['Named Entity'], token_recall['Named Entity'])

    def test_cached_predict(self):
        """
        Train the model, then predict twice inside cached_predict()
        Ensure neither call errors out
        """
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            encoder=self.model.input_pipeline.text_encoder,
            none_value=self.model.config.pad_token
        )
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1
        )

        self.model.fit(train_texts, train_annotations)

        with self.model.cached_predict():
            for _ in range(2):
                self.model.predict(test_texts)

    def test_fit_predict_multi_model(self):
        """
        Train a model built with multi_label_sequences=True
        Check the prediction structure
        Check the model can be saved, reloaded and used to predict
        """
        self.model = SequenceLabeler(
            batch_size=2,
            max_length=256,
            lm_loss_coef=0.0,
            multi_label_sequences=True,
        )
        joined_docs = ["".join(chunks) for chunks in self.texts]
        texts, annotations = finetune_to_indico_sequence(
            joined_docs,
            self.texts,
            self.labels,
            encoder=self.model.input_pipeline.text_encoder,
            none_value=self.model.config.pad_token
        )
        train_texts, test_texts, train_annotations, _ = train_test_split(
            texts, annotations, test_size=0.1
        )

        self.model.fit(train_texts, train_annotations)
        self.model.predict(test_texts)

        probas = self.model.predict_proba(test_texts)
        self.assertIsInstance(probas, list)
        first_doc = probas[0]
        self.assertIsInstance(first_doc, list)
        first_span = first_doc[0]
        self.assertIsInstance(first_span, dict)
        self.assertIsInstance(first_span['confidence'], dict)

        self.model.save(self.save_file)
        reloaded = SequenceLabeler.load(self.save_file)
        reloaded.predict(test_texts)
コード例 #30
0
ファイル: reuters.py プロジェクト: seeker1943/finetune
            subtoken_predictions=True)
        df = pd.DataFrame({
            'texts':
            texts,
            'annotations':
            [json.dumps(annotation) for annotation in annotations]
        })
        df.to_csv(DATA_PATH)


if __name__ == "__main__":
    # Build the dataset and decode the JSON-encoded annotation column.
    dataset = Reuters().dataframe
    dataset['annotations'] = list(map(json.loads, dataset['annotations']))

    # 70 / 30 train / test split with a fixed seed.
    trainX, testX, trainY, testY = train_test_split(
        dataset.texts.values,
        dataset.annotations.values,
        test_size=0.3,
        random_state=42,
    )

    model = SequenceLabeler(
        base_model=GPT2,
        batch_size=2,
        val_size=0.,
        max_length=16,
        chunk_long_sequences=True,
        subtoken_predictions=True,
        filter_empty_examples=True,
    )
    model.fit(trainX, trainY)
    predictions = model.predict(testX)

    # Raw predictions first, then the report comparing them with testY.
    print(predictions)
    print(annotation_report(testY, predictions))
コード例 #31
0
ファイル: test_sequence.py プロジェクト: takuma-ynd/finetune
class TestSequenceLabeler(unittest.TestCase):

    n_sample = 100
    n_hidden = 768
    dataset_path = os.path.join(
        'Data', 'Sequence', 'reuters.xml'
    )
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download the Reuters named-entity XML and cache a processed JSON copy.

        The raw XML is fetched to `cls.dataset_path` only when that file does
        not exist yet (its parent directory is created on demand). The XML is
        then parsed on every call, and the per-document lists of text spans
        and their labels are written as JSON to `cls.processed_path`.

        Raises:
            requests.HTTPError: if the download responds with an error status,
                so that an error page is never cached as the dataset.
        """
        path = Path(cls.dataset_path)
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            url = "https://raw.githubusercontent.com/dice-group/n3-collection/master/reuters.xml"
            r = requests.get(url)
            # Fail loudly on 4xx/5xx: otherwise the error body would be saved
            # and silently reused by later runs because the file then exists.
            r.raise_for_status()
            with open(cls.dataset_path, "wb") as fp:
                fp.write(r.content)

        with codecs.open(cls.dataset_path, "r", "utf-8") as infile:
            soup = bs(infile, "html5lib")

        docs = []
        docs_labels = []
        for elem in soup.find_all("document"):
            texts = []
            labels = []

            # Loop through each child of the element under "textwithnamedentities"
            for c in elem.find("textwithnamedentities").children:
                # Children that are not tags (e.g. bare strings) are skipped.
                if isinstance(c, Tag):
                    if c.name == "namedentityintext":
                        label = "Named Entity"  # part of a named entity
                    else:
                        label = "<PAD>"  # irrelevant word
                    texts.append(c.text)
                    labels.append(label)

            docs.append(texts)
            docs_labels.append(labels)

        # Stored as a two-element JSON array: [docs, docs_labels].
        with open(cls.processed_path, 'wt') as fp:
            json.dump((docs, docs_labels), fp)


    @classmethod
    def setUpClass(cls):
        """
        Class-level fixture: delegate to `_download_reuters` so the data files
        are prepared before the tests in this class run.
        """
        cls._download_reuters()

    def setUp(self):
        """
        Per-test fixture.

        Sets the path used for saving models, loads the processed texts and
        labels from `processed_path`, resets the default TensorFlow graph and
        creates a fresh `SequenceLabeler` on `self.model`.
        """
        self.save_file = 'tests/saved-models/test-save-load'

        # The processed file is a JSON pair: (texts, labels).
        with open(self.processed_path, 'rt') as handle:
            loaded = json.load(handle)
        self.texts, self.labels = loaded

        # Reset the default graph before constructing this test's model.
        tf.reset_default_graph()
        model_kwargs = {'batch_size': 2, 'max_length': 256, 'verbose': False}
        self.model = SequenceLabeler(**model_kwargs)

    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions

        Exercises fit and predict on a train/test split of the loaded data,
        then saves the model, reloads it with `SequenceLabeler.load` and
        predicts again with the restored copy.
        """
        # Rebuild each raw document by concatenating its text pieces.
        raw_docs = ["".join(text) for text in self.texts]
        # Convert the per-piece texts/labels into the (texts, annotations) pair
        # that is passed to fit() below.
        texts, annotations = finetune_to_indico_sequence(raw_docs, self.texts, self.labels)
        train_texts, test_texts, train_annotations, test_annotations = train_test_split(texts, annotations)
        self.model.fit(train_texts, train_annotations)
        predictions = self.model.predict(test_texts)
        # Round-trip through save/load and predict with the restored model.
        self.model.save(self.save_file)
        model = SequenceLabeler.load(self.save_file)
        predictions = model.predict(test_texts)