def test_fit_lm_only(self):
    """
    Fit on raw text alone, then fit again with annotations, and check
    that prediction, probability output, metrics and save/load all work.

    Checks performed:
      * predict_proba returns a list of lists of dicts, each carrying a
        'confidence' dict
      * every token/overlap precision and recall result has a
        'Named Entity' entry
      * the model can be saved, reloaded and used to predict again
    """
    joined_docs = list(map("".join, self.texts))
    docs, doc_labels = finetune_to_indico_sequence(
        joined_docs, self.texts, self.labels
    )
    x_train, x_test, y_train, y_test = train_test_split(
        docs, doc_labels, test_size=0.1
    )

    # First pass sees texts only (no annotations); second pass is supervised.
    self.model.fit(x_train)
    self.model.fit(x_train, y_train)

    predicted = self.model.predict(x_test)
    probabilities = self.model.predict_proba(x_test)

    self.assertIsInstance(probabilities, list)
    first_doc = probabilities[0]
    self.assertIsInstance(first_doc, list)
    first_entry = first_doc[0]
    self.assertIsInstance(first_entry, dict)
    self.assertIsInstance(first_entry['confidence'], dict)

    # Compute every metric up front, then verify each reports the class.
    metric_fns = (
        sequence_labeling_token_precision,
        sequence_labeling_token_recall,
        sequence_labeling_overlap_precision,
        sequence_labeling_overlap_recall,
    )
    per_class_results = [metric(y_test, predicted) for metric in metric_fns]
    for result in per_class_results:
        self.assertIn('Named Entity', result)

    # Round-trip through disk and make sure the reloaded model still predicts.
    self.model.save(self.save_file)
    reloaded = SequenceLabeler.load(self.save_file)
    reloaded.predict(x_test)
def test_fit_predict_multi_model(self):
    """
    Train a SequenceLabeler built with multi_label_sequences=True and
    check that it predicts, returns well-formed probabilities, and can
    be saved, reloaded and used to predict again.
    """
    labeler_options = {
        "batch_size": 2,
        "max_length": 256,
        "lm_loss_coef": 0.0,
        "multi_label_sequences": True,
    }
    self.model = SequenceLabeler(**labeler_options)

    joined_docs = list(map("".join, self.texts))
    # The model's configured pad token is passed as the none_value.
    docs, doc_labels = finetune_to_indico_sequence(
        joined_docs,
        self.texts,
        self.labels,
        none_value=self.model.config.pad_token,
    )
    x_train, x_test, y_train, _ = train_test_split(
        docs, doc_labels, test_size=0.1
    )

    self.model.fit(x_train, y_train)
    self.model.predict(x_test)

    probabilities = self.model.predict_proba(x_test)
    self.assertIsInstance(probabilities, list)
    first_doc = probabilities[0]
    self.assertIsInstance(first_doc, list)
    first_entry = first_doc[0]
    self.assertIsInstance(first_entry, dict)
    self.assertIsInstance(first_entry['confidence'], dict)

    # Round-trip through disk and make sure the reloaded model still predicts.
    self.model.save(self.save_file)
    reloaded = SequenceLabeler.load(self.save_file)
    reloaded.predict(x_test)
def setUpClass(cls):
    """
    Prepare shared fixtures: datasets plus trained (or cached) models.

    Each model is loaded from its saved path when that file exists;
    on FileNotFoundError it is built, fit and saved so that later runs
    can reuse it.

    Attributes set on ``cls``:
      * classifier_dataset -- first ``n_sample * 10`` rows of the CSV at
        ``classifier_dataset_path``
      * texts, labels -- loaded from ``data/testdata.json`` next to this file
      * animals, numbers -- word lists used to build comparison pairs
      * s -- SequenceLabeler
      * cl -- Classifier
      * x_tr, x_te, t_tr, t_te, cr -- comparison data split and
        ComparisonRegressor (only when ``do_comparison`` is truthy)

    NOTE(review): no ``@classmethod`` decorator is visible on this
    definition in the reviewed chunk; unittest invokes ``setUpClass`` on
    the class itself, so confirm the decorator is present in the file.
    """
    cls._download_data()

    # Dataset preparation.
    cls.classifier_dataset = pd.read_csv(
        cls.classifier_dataset_path, nrows=cls.n_sample * 10
    )

    path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, 'rt', encoding="utf-8") as fp:
        cls.texts, cls.labels = json.load(fp)

    cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat",
                   "chicken", "guinea pig", "donkey", "turkey", "duck",
                   "camel", "goose", "llama", "rabbit", "fox"]
    cls.numbers = ["one", "two", "three", "four", "five", "six", "seven",
                   "eight", "nine", "ten", "eleven", "twelve", "thirteen",
                   "fourteen", "fifteen", "sixteen"]

    # Train and save sequence labeler for later use.
    try:
        cls.s = SequenceLabeler.load(
            cls.sequence_labeler_path, **cls.default_seq_config(cls)
        )
    except FileNotFoundError:
        cls.s = SequenceLabeler(**cls.default_seq_config(cls))
        cls.s.fit(cls.texts * 10, cls.labels * 10)
        cls.s.save(cls.sequence_labeler_path)

    # Train and save classifier for later use.
    train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
    try:
        cls.cl = Classifier.load(cls.classifier_path)
    except FileNotFoundError:
        cls.cl = Classifier(**cls.default_config(cls))
        cls.cl.fit(train_sample.Text, train_sample.Target)
        cls.cl.save(cls.classifier_path)

    if cls.do_comparison:
        # Train and save comparison regressor for later use.
        n_per = 150

        # "Similar" pairs draw both members from the same word list;
        # "different" pairs take one animal and one number.
        similar = [
            [random.choice(dataset), random.choice(dataset)]
            for dataset in [cls.animals, cls.numbers]
            for _ in range(n_per // 2)
        ]
        different = [
            [random.choice(cls.animals), random.choice(cls.numbers)]
            for _ in range(n_per)
        ]

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(
            data, targets, test_size=0.3, random_state=42
        )

        # No throwaway ComparisonRegressor() is built up front: cls.cr is
        # always assigned by one of the two branches below.
        try:
            cls.cr = ComparisonRegressor.load(
                cls.comparison_regressor_path, **cls.default_config(cls)
            )
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)
def test_fit_predict(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions
    Ensure a saved and reloaded model returns predictions as well
    """
    raw_docs = ["".join(text) for text in self.texts]
    texts, annotations = finetune_to_indico_sequence(
        raw_docs, self.texts, self.labels
    )
    # The held-out annotations are not needed here, so they are discarded.
    train_texts, test_texts, train_annotations, _ = train_test_split(
        texts, annotations
    )

    self.model.fit(train_texts, train_annotations)
    predictions = self.model.predict(test_texts)
    # One prediction entry is expected per input document.
    self.assertEqual(len(predictions), len(test_texts))

    # Round-trip through disk and check the reloaded model the same way.
    self.model.save(self.save_file)
    model = SequenceLabeler.load(self.save_file)
    reloaded_predictions = model.predict(test_texts)
    self.assertEqual(len(reloaded_predictions), len(test_texts))