def main():
    parser = argparse.ArgumentParser(description='Inter-dataset agreement calculator')
    parser.add_argument('gold_directory', help='First data folder path (gold)')
    parser.add_argument('system_directory', help='Second data folder path (system)')
    parser.add_argument('-m', '--mode', default='strict', help='strict or lenient (defaults to strict)')
    parser.add_argument('-f', '--format', default='plain',
                        help='format to print the table (options include grid, github, and latex)')
    parser.add_argument('-d', '--decimal', type=int, default=3, help='number of decimal places to round to')
    args = parser.parse_args()

    gold_dataset = Dataset(args.gold_directory)
    system_dataset = Dataset(args.system_directory)

    result = measure_dataset(gold_dataset, system_dataset, args.mode)
    output = format_results(result, num_dec=args.decimal, table_format=args.format)
    print(output)
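# Usage sketch (not part of the original source): a minimal programmatic equivalent of main()
# above, assuming measure_dataset and format_results are defined in this module; the directory
# paths are placeholders.
def _example_agreement_run():
    from medacy.data.dataset import Dataset

    gold = Dataset('/path/to/gold')        # placeholder path
    system = Dataset('/path/to/system')    # placeholder path
    result = measure_dataset(gold, system, 'strict')
    print(format_results(result, num_dec=3, table_format='github'))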
def setUpClass(cls):
    cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'), data_limit=1)
    cls.entities = cls.dataset.get_labels(as_list=True)
    cls.prediction_directory = tempfile.mkdtemp()  # Directory to store predictions
    cls.batch_size = 3
def main(): parser = argparse.ArgumentParser(description="Display which annotations in a dataset overlap") parser.add_argument("dataset", help="Directory of the dataset") args = parser.parse_args() dataset = Dataset(args.dataset) calculate_dataset_overlap(dataset)
def setUpClass(cls):
    cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
    cls.entities = cls.dataset.get_labels(as_list=True)
    cls.prediction_directory = tempfile.mkdtemp()  # directory to store predictions
    cls.prediction_directory_2 = tempfile.mkdtemp()
    cls.pipeline = TestingPipeline(entities=cls.entities)
def test_cross_validate_create_groundtruth_predictions(self):
    """
    Tests that during cross validation, the medaCy groundtruth (that is, the version of the training
    dataset used by medaCy) is written as well as the predictions that are created for each fold
    """
    model = Model(self.pipeline)
    model.cross_validate(
        self.dataset,
        num_folds=2,
        prediction_directory=self.prediction_directory_3,
        groundtruth_directory=self.groundtruth_directory
    )

    prediction_dataset = Dataset(self.prediction_directory_3)
    groundtruth_dataset = Dataset(self.groundtruth_directory)

    for d in [prediction_dataset, groundtruth_dataset]:
        self.assertIsInstance(d, Dataset)

    original_file_names = {d.file_name for d in self.dataset}
    prediction_file_names = {d.file_name for d in prediction_dataset}
    groundtruth_file_names = {d.file_name for d in groundtruth_dataset}

    for n in [prediction_file_names, groundtruth_file_names]:
        self.assertSetEqual(n, original_file_names)

    # Container for all Annotations in all files in all folds
    all_anns_all_folds_actual = Annotations([])

    # Test that fold groundtruth is written to file
    for fold_name in ["fold_1", "fold_2"]:
        fold_dataset = Dataset(groundtruth_dataset.data_directory / fold_name)
        for d in fold_dataset:
            fold_ann = Annotations(d.ann_path)
            groundtruth_ann = groundtruth_dataset[d.file_name]
            # Test that the entities in the fold groundtruth are a subset of the whole for that file
            self.assertTrue(set(fold_ann) <= set(groundtruth_ann))
            all_anns_all_folds_actual |= fold_ann

    # Container for all annotations pulled directly from the groundtruth dataset
    all_groundtruth_tuples = Annotations([])
    for ann in groundtruth_dataset.generate_annotations():
        all_groundtruth_tuples |= ann

    expected = set(all_groundtruth_tuples)
    actual = set(all_anns_all_folds_actual)
    self.assertSetEqual(expected, actual)
def setUpClass(cls):
    cls.dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
    cls.prediction_directory = tempfile.mkdtemp()  # Set up predict directory
    cls.entities = cls.dataset.get_labels(as_list=True)
    cls.ann_files = []

    # Fill directory of prediction files (only the text files)
    for data_file in cls.dataset:
        new_file_path = os.path.join(cls.prediction_directory, data_file.file_name + '.txt')
        shutil.copyfile(data_file.txt_path, new_file_path)
def setUpClass(cls): """Loads sample dataset and sets up a temporary directory for IO tests""" cls.test_dir = tempfile.mkdtemp() # set up temp directory cls.sample_data_dir = os.path.join(test_dir, 'sample_dataset_1') cls.dataset = Dataset(cls.sample_data_dir) cls.entities = cls.dataset.get_labels(as_list=True) with open(os.path.join(cls.test_dir, "broken_ann_file.ann"), 'w') as f: f.write("This is clearly not a valid ann file") cls.ann_path_1 = cls.dataset.data_files[0].ann_path cls.ann_path_2 = cls.dataset.data_files[1].ann_path
def predict_directory(self, data_directory, prediction_directory):
    """
    Predicts over all txt files in a directory using every Model. Note that this method spends a lot of
    time on file IO because each txt file is opened as many times as there are models.

    :param data_directory: Path to a directory of text files to predict over
    :param prediction_directory: a directory to write predictions to
    :return: a Dataset of the predictions
    """
    if not os.path.isdir(data_directory):
        raise ValueError(f"'data_directory' must be an existing directory, but is '{repr(data_directory)}'")
    if not os.path.isdir(prediction_directory):
        raise ValueError(f"'prediction_directory' must be a directory, but is '{repr(prediction_directory)}'")

    # Get all the txt files in the input directory
    txt_files = [f for f in os.listdir(data_directory) if f.endswith('.txt')]

    # Create a dictionary of empty Annotations objects to store the predictions
    annotation_dict = {f: Annotations([], source_text_path=f) for f in txt_files}

    for model in self:
        for file_name in txt_files:
            file_path = os.path.join(data_directory, file_name)
            with open(file_path) as f:
                text = f.read()

            this_annotations = annotation_dict[file_name]
            resulting_annotations = model.predict(text)
            # Merge the two Annotations together and store them back in the dictionary
            annotation_dict[file_name] = this_annotations | resulting_annotations

    # Create the new Dataset directory
    for path, ann in annotation_dict.items():
        # Get the name of the output ann file
        path = os.path.join(data_directory, path)
        base_name = os.path.basename(path)[:-4]
        output_ann = os.path.join(prediction_directory, base_name + '.ann')
        output_txt = os.path.join(prediction_directory, base_name + '.txt')

        # Write the ann file
        ann.to_ann(output_ann)
        # Copy the txt file
        copyfile(path, output_txt)

    return Dataset(prediction_directory)
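# Usage sketch (not part of the original source): predicting over a directory with a MultiModel.
# The model paths and directories are placeholders; ClinicalPipeline and TestingPipeline are assumed
# to be importable as in the MultiModel tests elsewhere in this code base, and the entity names are
# taken from those tests. The call pattern mirrors predict_directory() above.
def _example_multi_model_prediction():
    multimodel = MultiModel()
    multimodel.add_model('/path/to/model_1.pkl', ClinicalPipeline, ['Endpoints', 'Species'])      # placeholders
    multimodel.add_model('/path/to/model_2.pkl', TestingPipeline, entities=['TestArticle', 'Dose'])  # placeholders

    predictions = multimodel.predict_directory('/path/to/txt_files', '/path/to/output')
    print(predictions.get_labels())  # labels predicted across all models
    return predictions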
def test_fit_predict_dump_load(self):
    """Fits a model, tests that it predicts correctly, dumps and loads it, then tests that it still predicts"""
    model = Model(self.pipeline)

    # Test attempting to predict before fitting
    with self.assertRaises(RuntimeError):
        model.predict('Lorem ipsum dolor sit amet.')

    model.fit(self.dataset, groundtruth_directory=self.groundtruth_2_directory)

    # Test X and y data are set
    self.assertTrue(model.X_data)
    self.assertTrue(model.y_data)

    # Test that there is at least one prediction
    resulting_ann = model.predict(
        'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
    )
    self.assertIsInstance(resulting_ann, Annotations)
    self.assertTrue(resulting_ann)

    # Test prediction over directory
    resulting_dataset = model.predict(self.dataset.data_directory, prediction_directory=self.prediction_directory)
    self.assertIsInstance(resulting_dataset, Dataset)
    self.assertEqual(len(self.dataset), len(resulting_dataset))

    # Test that groundtruth is written
    groundtruth_dataset = Dataset(self.groundtruth_2_directory)
    expected = [d.file_name for d in self.dataset]
    actual = [d.file_name for d in groundtruth_dataset]
    self.assertListEqual(expected, actual)

    # Test that the groundtruth ann files have content
    for ann in groundtruth_dataset.generate_annotations():
        self.assertTrue(ann)

    # Test pickling a model
    pickle_path = os.path.join(self.prediction_directory, 'test.pkl')
    model.dump(pickle_path)
    new_model = Model(self.pipeline)
    new_model.load(pickle_path)

    # Test that there is at least one prediction
    resulting_ann = new_model.predict(
        'To exclude the possibility that alterations in PSSD might be a consequence of changes in the volume of reference, we used a subset of the vibratome sections'
    )
    self.assertIsInstance(resulting_ann, Annotations)
    self.assertTrue(resulting_ann)
def setUpClass(cls) -> None:
    if not have_metamap:
        return
    cls.metamap = MetaMap(metamap_path)
    cls.metamap.activate()

    # Create an unmetamapped copy of the sample dataset
    cls.temp_dataset_dir = tempfile.mkdtemp()
    for df in sample_dataset:
        shutil.copyfile(df.txt_path, os.path.join(cls.temp_dataset_dir, df.file_name + '.txt'))
        shutil.copyfile(df.ann_path, os.path.join(cls.temp_dataset_dir, df.file_name + '.ann'))

    cls.dataset = Dataset(cls.temp_dataset_dir)
def setup(args): """ Sets up dataset and pipeline/model since it gets used by every command. :param args: Argparse args object. :return dataset, model: The dataset and model objects created. """ dataset = Dataset(args.dataset) entities = list(dataset.get_labels()) if args.test_mode: dataset.data_limit = 1 if args.entities is not None: with open(args.entities, 'rb') as f: data = json.load(f) json_entities = data['entities'] if not set(json_entities) <= set(entities): raise ValueError( f"The following entities from the json file are not in the provided dataset: {set(json_entities) - set(entities)}" ) entities = json_entities if args.custom_pipeline is not None: logging.info( f"Using custom pipeline configured at {args.custom_pipeline}") # Construct a pipeline class (not an instance) based on the provided json path; # args.custom_pipeline is that path Pipeline = json_to_pipeline(args.custom_pipeline) else: # Parse the argument as a class name in module medacy.pipelines module = importlib.import_module("medacy.pipelines") Pipeline = getattr(module, args.pipeline) logging.info('Using %s', args.pipeline) pipeline = Pipeline(entities=entities, cuda_device=args.cuda, word_embeddings=args.word_embeddings, batch_size=args.batch_size, learning_rate=args.learning_rate, epochs=args.epochs, pretrained_model=args.pretrained_model, using_crf=args.using_crf) model = Model(pipeline) return dataset, model
def predict_annotation_evaluation(self, directory, training_dataset, preds_by_document, groundtruth_by_document, option):
    for data_file in training_dataset:
        logging.info("Predicting %s file: %s", option, data_file.file_name)
        with open(data_file.txt_path, 'r') as f:
            doc = self.pipeline.spacy_pipeline.make_doc(f.read())

        if option == "groundtruth":
            preds = groundtruth_by_document[data_file.file_name]
        else:
            preds = preds_by_document[data_file.file_name]

        annotations = construct_annotations_from_tuples(doc, preds)
        annotations.to_ann(write_location=os.path.join(directory, data_file.file_name + ".ann"))

    return Dataset(directory)
def setup(args): """ Sets up dataset and pipeline/model since it gets used by every command. :param args: Argparse args object. :return dataset, model: The dataset and model objects created. """ dataset = Dataset(args.dataset) entities = list(dataset.get_labels()) pipeline = None if args.pipeline == 'spacy': logging.info('Using spacy model') model = SpacyModel(spacy_model_name=args.spacy_model, cuda=args.cuda) elif args.custom_pipeline is not None: # Construct a pipeline class (not an instance) based on the provided json path; # args.custom_pipeline is that path Pipeline = json_to_pipeline(args.custom_pipeline) # All parameters are part of the class, thus nothing needs to be set when instantiating pipeline = Pipeline() model = Model(pipeline) else: # Parse the argument as a class name in module medacy.pipelines module = importlib.import_module("medacy.pipelines") Pipeline = getattr(module, args.pipeline) logging.info('Using %s', args.pipeline) pipeline = Pipeline(entities=entities, cuda_device=args.cuda, word_embeddings=args.word_embeddings, batch_size=args.batch_size, learning_rate=args.learning_rate, epochs=args.epochs, pretrained_model=args.pretrained_model, using_crf=args.using_crf) model = Model(pipeline) return dataset, model
def test_multi_model(self):
    """Runs all tests for valid uses of MultiModel"""
    data = Dataset(self.data_dir)
    ents_1 = {'Endpoints', 'Species', 'DoseUnits'}
    ents_2 = {'TestArticle', 'Dose', 'Sex'}

    multimodel = MultiModel()
    # Test that *args works
    multimodel.add_model(self.sample_model_1_path, ClinicalPipeline, list(ents_1))
    # Test that **kwargs works
    multimodel.add_model(self.sample_model_2_path, TestingPipeline, entities=list(ents_2))

    # Test __len__
    self.assertEqual(len(multimodel), 2)

    # Test that each model gets instantiated correctly
    for model, pipeline_class in zip(multimodel, [ClinicalPipeline, TestingPipeline]):
        current_pipeline = model.pipeline
        self.assertIsInstance(current_pipeline, pipeline_class)
        self.assertGreater(len(current_pipeline.entities), 0)

    # Test predict_directory
    resulting_data = multimodel.predict_directory(data.data_directory, self.temp_dir)
    labeled_items = resulting_data.get_labels()

    # Test that at least one label from each model is predicted
    self.assertTrue(any(e in ents_1 for e in labeled_items))
    self.assertTrue(any(e in ents_2 for e in labeled_items))

    # Test that all files get predicted for
    self.assertEqual(len(resulting_data), len(data))
def main(): parser = argparse.ArgumentParser( description="Calculate the lexical variation in a given dataset") parser.add_argument('dataset', help="Path to the dataset directory") parser.add_argument( '-f', '--format', help= "Format to print the table (options include grid, github, and latex)") args = parser.parse_args() data = Dataset(args.dataset) unique_mention_dict = calculate_unique_mentions(data) tag_counts = data.compute_counts() table = [['Tag', 'Unique Mentions', 'Total Mentions', 'Ratio']] for tag, mentions in unique_mention_dict.items(): table.append([ tag, len(mentions), tag_counts[tag], len(mentions) / tag_counts[tag] ]) print(tabulate.tabulate(table, headers="firstrow", tablefmt=args.format))
def test_init_with_data_limit(self):
    """Tests that initializing with a data limit works"""
    dataset = Dataset(self.dataset.data_directory, data_limit=1)
    self.assertEqual(len(dataset), 1)
def cross_validate(self, training_dataset=None, num_folds=5, prediction_directory=None,
                   groundtruth_directory=None, asynchronous=False):
    """
    Performs k-fold stratified cross-validation using our model and pipeline.

    If the training dataset, groundtruth_directory, and prediction_directory are passed, the intermediate
    predictions made during cross validation are written to `prediction_directory`. This allows one to
    construct a confusion matrix or to compute the prediction ambiguity with the methods present in the
    Dataset class, supporting pipeline development without a designated evaluation set.

    :param training_dataset: Dataset that is being cross validated (optional)
    :param num_folds: number of folds to split training data into for cross validation
    :param prediction_directory: directory to write predictions of cross validation to, or `True` for the
        default predictions sub-directory.
    :param groundtruth_directory: directory to write the ground truth medaCy evaluates on
    :param asynchronous: Boolean for whether the preprocessing should be done asynchronously.
    :return: a Dataset of the predictions if `prediction_directory` is given, otherwise a dict of
        per-entity statistics averaged over all folds; per-fold performance metrics are logged.
    """
    if num_folds <= 1:
        raise ValueError("Number of folds for cross validation must be greater than 1, but is %s" % repr(num_folds))

    if prediction_directory is not None and training_dataset is None:
        raise ValueError(
            "Cannot generate predictions during cross validation if training dataset is not given."
            " Please pass the training dataset in the 'training_dataset' parameter."
        )
    if groundtruth_directory is not None and training_dataset is None:
        raise ValueError(
            "Cannot generate groundtruth during cross validation if training dataset is not given."
            " Please pass the training dataset in the 'training_dataset' parameter."
        )

    pipeline_report = self.pipeline.get_report()

    self.preprocess(training_dataset, asynchronous)

    if not (self.X_data and self.y_data):
        raise RuntimeError("Must have features and labels extracted for cross validation")

    tags = sorted(training_dataset.get_labels(as_list=True))
    self.pipeline.entities = tags
    logging.info('Tagset: %s', tags)

    eval_stats = {}

    # Dict for storing mapping of sequences to their corresponding file
    groundtruth_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}
    preds_by_document = {filename: [] for filename in {x[2] for x in self.X_data}}

    folds = create_folds(self.y_data, num_folds)

    for fold_num, fold_data in enumerate(folds, 1):
        train_indices, test_indices = fold_data
        fold_statistics = {}
        learner_name, learner = self.pipeline.get_learner()

        X_train = [self.X_data[index] for index in train_indices]
        y_train = [self.y_data[index] for index in train_indices]

        X_test = [self.X_data[index] for index in test_indices]
        y_test = [self.y_data[index] for index in test_indices]

        logging.info("Training Fold %i", fold_num)
        train_data = [x[0] for x in X_train]
        test_data = [x[0] for x in X_test]
        learner.fit(train_data, y_train)
        y_pred = learner.predict(test_data)

        if groundtruth_directory is not None:
            # Flattening nested structures into 2d lists
            document_indices = []
            span_indices = []
            for sequence in X_test:
                document_indices += [sequence[2]] * len(sequence[0])
                span_indices += list(sequence[1])
            groundtruth = [element for sentence in y_test for element in sentence]

            # Map the predicted sequences to their corresponding documents
            i = 0
            while i < len(groundtruth):
                if groundtruth[i] == 'O':
                    i += 1
                    continue

                entity = groundtruth[i]
                document = document_indices[i]
                first_start, first_end = span_indices[i]

                # Ensure that consecutive tokens with the same label are merged
                while i < len(groundtruth) - 1 and groundtruth[i + 1] == entity:
                    # If inside entity, keep incrementing
                    i += 1
                last_start, last_end = span_indices[i]

                groundtruth_by_document[document].append((entity, first_start, last_end))
                i += 1

        if prediction_directory is not None:
            # Flattening nested structures into 2d lists
            document_indices = []
            span_indices = []
            for sequence in X_test:
                document_indices += [sequence[2]] * len(sequence[0])
                span_indices += list(sequence[1])
            predictions = [element for sentence in y_pred for element in sentence]

            # Map the predicted sequences to their corresponding documents
            i = 0
            while i < len(predictions):
                if predictions[i] == 'O':
                    i += 1
                    continue

                entity = predictions[i]
                document = document_indices[i]
                first_start, first_end = span_indices[i]

                # Ensure that consecutive tokens with the same label are merged
                while i < len(predictions) - 1 and predictions[i + 1] == entity:
                    # If inside entity, keep incrementing
                    i += 1
                last_start, last_end = span_indices[i]

                preds_by_document[document].append((entity, first_start, last_end))
                i += 1

        # Write the metrics for this fold.
        for label in tags:
            fold_statistics[label] = {
                "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]),
                "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]),
                "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label])
            }

        # add averages
        fold_statistics['system'] = {
            "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=tags),
            "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=tags),
            "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=tags)
        }

        table_data = [
            [
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ]
            for label in tags + ['system']
        ]

        logging.info('\n' + tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1'], tablefmt='orgtbl'))

        eval_stats[fold_num] = fold_statistics

    statistics_all_folds = {}

    for label in tags + ['system']:
        statistics_all_folds[label] = {
            'precision_average': mean(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'precision_max': max(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'precision_min': min(eval_stats[fold][label]['precision'] for fold in eval_stats),
            'recall_average': mean(eval_stats[fold][label]['recall'] for fold in eval_stats),
            'recall_max': max(eval_stats[fold][label]['recall'] for fold in eval_stats),
            'f1_average': mean(eval_stats[fold][label]['f1'] for fold in eval_stats),
            'f1_max': max(eval_stats[fold][label]['f1'] for fold in eval_stats),
            'f1_min': min(eval_stats[fold][label]['f1'] for fold in eval_stats),
        }

    entity_counts = training_dataset.compute_counts()

    table_data = [
        [
            f"{label} ({entity_counts[label]})",  # Entity (Count)
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ]
        for label in tags + ['system']
    ]

    # Combine the pipeline report and the resulting data, then log it or print it (whichever ensures that it prints)
    output_str = '\n' + pipeline_report + '\n\n' + tabulate(
        table_data,
        headers=['Entity (Count)', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max'],
        tablefmt='orgtbl'
    )

    if logging.root.level > logging.INFO:
        print(output_str)
    else:
        logging.info(output_str)

    if prediction_directory:
        prediction_directory = os.path.join(training_dataset.data_directory, "predictions")
        groundtruth_directory = os.path.join(training_dataset.data_directory, "groundtruth")

        # Write annotations generated from cross-validation
        self.create_annotation_directory(
            directory=prediction_directory,
            training_dataset=training_dataset,
            option="predictions"
        )

        # Write medaCy ground truth generated from cross-validation
        self.create_annotation_directory(
            directory=groundtruth_directory,
            training_dataset=training_dataset,
            option="groundtruth"
        )

        # Add predicted/known annotations to the folders containing groundtruth and predictions respectively
        self.predict_annotation_evaluation(
            directory=groundtruth_directory,
            training_dataset=training_dataset,
            preds_by_document=preds_by_document,
            groundtruth_by_document=groundtruth_by_document,
            option="groundtruth"
        )

        self.predict_annotation_evaluation(
            directory=prediction_directory,
            training_dataset=training_dataset,
            preds_by_document=preds_by_document,
            groundtruth_by_document=groundtruth_by_document,
            option="predictions"
        )

        return Dataset(prediction_directory)
    else:
        return statistics_all_folds
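# Usage sketch (not part of the original source): cross-validating a pipeline/model pair as the tests
# in this code base do. The dataset path is a placeholder and TestingPipeline is assumed to be
# importable; the call mirrors cross_validate() above.
def _example_cross_validation():
    from medacy.data.dataset import Dataset

    dataset = Dataset('/path/to/training_dataset')  # placeholder path
    pipeline = TestingPipeline(entities=dataset.get_labels(as_list=True))
    model = Model(pipeline)
    # With no prediction_directory, this returns the per-entity statistics averaged over the folds
    fold_stats = model.cross_validate(dataset, num_folds=5)
    return fold_stats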
def predict(self, input_data, prediction_directory=None):
    """
    Generates predictions over a string, or over the files referenced by input_data, utilizing the pipeline
    equipped to the instance.

    :param input_data: a string, Dataset, or directory path to predict over
    :param prediction_directory: The directory to write predictions to if doing bulk prediction
        (default: */predictions* sub-directory of the Dataset)
    :return: if input_data is a str, returns an Annotations of the predictions;
        if input_data is a Dataset or a valid directory path, returns a Dataset of the predictions.

    Note that if input_data is supposed to be a directory path but the directory is not found, it will be
    predicted over as a string. This can be prevented by validating inputs with os.path.isdir().
    """
    if self.model is None:
        raise RuntimeError("Must fit or load a pickled model before predicting")

    if isinstance(input_data, str) and not os.path.isdir(input_data):
        doc = self.pipeline.spacy_pipeline.make_doc(input_data)
        doc.set_extension('file_name', default=None, force=True)
        doc._.file_name = 'STRING_INPUT'
        doc = self.pipeline(doc, predict=True)
        annotations = predict_document(self.model, doc, self.pipeline)
        return annotations

    if isinstance(input_data, Dataset):
        input_files = [d.txt_path for d in input_data]
        # Change input_data to point to the Dataset's directory path so that we can use it
        # to create the prediction directory
        input_data = input_data.data_directory
    elif os.path.isdir(input_data):
        input_files = [os.path.join(input_data, f) for f in os.listdir(input_data) if f.endswith('.txt')]
    else:
        raise ValueError(
            f"'input_data' must be a string (which can be a directory path) or a Dataset, but is {repr(input_data)}"
        )

    if prediction_directory is None:
        prediction_directory = os.path.join(input_data, 'predictions')

    if os.path.isdir(prediction_directory):
        logging.warning("Overwriting existing predictions at %s", prediction_directory)
    else:
        os.mkdir(prediction_directory)

    for file_path in input_files:
        # Drop the '.txt' extension to get the base file name
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        logging.info("Predicting file: %s", file_path)

        with open(file_path, 'r') as f:
            doc = self.pipeline.spacy_pipeline.make_doc(f.read())

        doc.set_extension('file_name', default=None, force=True)
        doc._.file_name = file_name

        # run through the pipeline
        doc = self.pipeline(doc, predict=True)

        # Predict, creating a new Annotations object
        annotations = predict_document(self.model, doc, self.pipeline)
        logging.debug("Writing to: %s", os.path.join(prediction_directory, file_name + ".ann"))
        annotations.to_ann(write_location=os.path.join(prediction_directory, file_name + ".ann"))

        # Copy the txt file so that the output will also be a Dataset
        copyfile(file_path, os.path.join(prediction_directory, file_name + ".txt"))

    return Dataset(prediction_directory)
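# Usage sketch (not part of the original source): predicting a single string and a whole directory
# with a fitted Model, following the patterns in the tests of this code base. The pipeline class,
# example sentence, and paths are placeholders.
def _example_predict():
    from medacy.data.dataset import Dataset

    dataset = Dataset('/path/to/training_dataset')  # placeholder path
    model = Model(TestingPipeline(entities=dataset.get_labels(as_list=True)))
    model.fit(dataset)

    single_ann = model.predict('The patient was given 50 mg of ibuprofen.')  # returns an Annotations
    bulk_dataset = model.predict(dataset.data_directory,
                                 prediction_directory='/path/to/predictions')  # returns a Dataset
    return single_ann, bulk_dataset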
import os

from medacy.data.dataset import Dataset

test_dir = os.path.dirname(__file__)
sample_dataset = Dataset(os.path.join(test_dir, 'sample_dataset_1'))
def setUpClass(cls) -> None:
    cls.gold_dataset = sample_dataset
    cls.predicted_dataset = Dataset(str(sample_dataset.data_directory) + "_predictions")
    cls.maxDiff = None
import argparse
import glob
import os
from collections import defaultdict
from xml.etree import cElementTree

from medacy.data.dataset import Dataset

# Setup
parser = argparse.ArgumentParser(description='n2c2: Evaluation script for Track 2')
parser.add_argument('folder1', help='First data folder path (gold)')
parser.add_argument('folder2', help='Second data folder path (system)')
args = parser.parse_args()

gold_dataset = Dataset(args.folder1)
prediction_dataset = Dataset(args.folder2)
global_tags = tuple(gold_dataset.get_labels() & prediction_dataset.get_labels())


class ClinicalCriteria(object):
    """Criteria in the Track 1 documents."""

    def __init__(self, tid, value):
        """Init."""
        self.tid = tid.strip().upper()
        self.ttype = self.tid
        self.value = value.lower().strip()

    def equals(self, other, mode='strict'):
        """Return whether the current criteria is equal to the one provided."""
def test_init_prediction(self):
    """Tests that the copy of the sample dataset with only text files is identified as being for prediction"""
    dataset = Dataset(self.prediction_directory)
    self.assertIsInstance(dataset, Dataset)
    self.assertFalse(dataset.is_training_directory)
def test_init(self):
    """Tests initializing Datasets from different directories to see that they create accurate DataFiles"""
    # Test both txt, ann, and metamapped
    test_dir_path = Path(self.dataset.data_directory)
    expected = [
        DataFile(file_name="PMC1257590",
                 txt_path=test_dir_path / "PMC1257590.txt",
                 ann_path=test_dir_path / "PMC1257590.ann",
                 metamapped_path=test_dir_path / "metamapped" / "PMC1257590.metamapped"),
        DataFile(file_name="PMC1314908",
                 txt_path=test_dir_path / "PMC1314908.txt",
                 ann_path=test_dir_path / "PMC1314908.ann",
                 metamapped_path=test_dir_path / "metamapped" / "PMC1314908.metamapped"),
        DataFile(file_name="PMC1392236",
                 txt_path=test_dir_path / "PMC1392236.txt",
                 ann_path=test_dir_path / "PMC1392236.ann",
                 metamapped_path=test_dir_path / "metamapped" / "PMC1392236.metamapped")
    ]
    expected.sort(key=lambda x: x.file_name)
    actual = list(self.dataset)
    self.assertListEqual(actual, expected)

    # Test txt only
    test_dir_path = Path(self.prediction_directory)
    expected = [
        DataFile(file_name="PMC1257590",
                 txt_path=test_dir_path / "PMC1257590.txt",
                 ann_path=None,
                 metamapped_path=None),
        DataFile(file_name="PMC1314908",
                 txt_path=test_dir_path / "PMC1314908.txt",
                 ann_path=None,
                 metamapped_path=None),
        DataFile(file_name="PMC1392236",
                 txt_path=test_dir_path / "PMC1392236.txt",
                 ann_path=None,
                 metamapped_path=None)
    ]
    expected.sort(key=lambda x: x.file_name)
    actual = list(Dataset(self.prediction_directory))
    self.assertListEqual(actual, expected)

    # Test ann only
    test_dir_path = Path(self.ann_dir)
    expected = [
        DataFile(file_name="PMC1257590",
                 txt_path=None,
                 ann_path=test_dir_path / "PMC1257590.ann",
                 metamapped_path=None),
        DataFile(file_name="PMC1314908",
                 txt_path=None,
                 ann_path=test_dir_path / "PMC1314908.ann",
                 metamapped_path=None),
        DataFile(file_name="PMC1392236",
                 txt_path=None,
                 ann_path=test_dir_path / "PMC1392236.ann",
                 metamapped_path=None)
    ]
    expected.sort(key=lambda x: x.file_name)
    actual = list(Dataset(self.ann_dir))
    self.assertListEqual(actual, expected)