def test_get_classes_from_filter(self):
    """Check get_classes_from_filter() against a table of known filters.

    Every case pairs a classification filter with the set of class names
    that the database opened from META_FILE is expected to return for it.
    """
    cases = (
        # Every genus.
        (
            {'class': 'genus'},
            {
                'Cypripedium',
                'Mexipedium',
                'Paphiopedilum',
                'Selenipedium',
                'Phragmipedium',
            },
        ),
        # Every section. None is one of the expected values (presumably
        # for taxa that have no section -- confirm against the test data).
        (
            {'class': 'section'},
            {
                'Obtusipetala',
                'Arietinum',
                'Trigonopedia',
                'Brachypetalum',
                'Micropetalum',
                None,
            },
        ),
        # Sections restricted to a single genus.
        (
            {'class': 'section', 'where': {'genus': 'Mexipedium'}},
            {None},
        ),
        # Species restricted to a genus and a section.
        (
            {
                'class': 'species',
                'where': {
                    'genus': 'Cypripedium',
                    'section': 'Trigonopedia',
                },
            },
            {'sichuanense', 'fargesii'},
        ),
    )

    with db.session_scope(META_FILE) as (session, metadata):
        for filter_, expected in cases:
            found = db.get_classes_from_filter(session, metadata, filter_)
            self.assertEqual(found, expected)
def batch_train(self, data_dir, output_dir):
    """Train a neural network for each path in the classification hierarchy.

    For every filter derived from the classification hierarchy, the training
    file is resolved below `data_dir` and the network file below
    `output_dir`. Placeholders of the form ``__<level>__`` in both paths are
    replaced with the values from the filter. Filters that yield a single
    class are skipped, and so are networks for which train() raises
    FileExistsError.
    """
    session, metadata = db.get_session_or_error()

    # The taxon hierarchy is loaded here rather than in the constructor,
    # because set_photo_count_min() may be called after construction.
    self._load_taxon_hierarchy()

    # Names of the classification hierarchy levels, in order.
    level_names = [entry.name for entry in self.class_hr]

    for filter_ in classification_hierarchy_filters(level_names,
                                                    self.taxon_hr):
        level_conf = self.class_hr[level_names.index(filter_.get('class'))]
        train_file = os.path.join(data_dir, level_conf.train_file)
        ann_file = os.path.join(output_dir, level_conf.ann_file)

        # The ANN settings of a level are optional.
        config = level_conf.ann if 'ann' in level_conf else None

        # Fill in the path placeholders; a missing value becomes '_'.
        for key, val in filter_.get('where', {}).items():
            placeholder = "__%s__" % key
            substitute = '_' if val is None else val
            train_file = train_file.replace(placeholder, substitute)
            ann_file = ann_file.replace(placeholder, substitute)

        # Look up the classes for this filter in the database.
        classes = db.get_classes_from_filter(session, metadata, filter_)
        assert len(classes) > 0, \
            "No classes found for filter `%s`" % filter_

        # Nothing to train when the filter yields a single class.
        if len(classes) <= 1:
            logging.debug("Only one class for this filter. Skipping "
                          "training of %s" % ann_file)
            continue

        logging.info("Training network `%s` with training data "
                     "from `%s` ..." % (ann_file, train_file))
        try:
            self.train(train_file, ann_file, config)
        except FileExistsError as e:
            # An existing network file is left untouched.
            logging.warning("Skipping: %s" % e)
def test_trainer_ad(self):
    """Classify one test image with `self.ann_file` and print the classes.

    The classes for the configured classification filter are read from the
    database opened from META_FILE; a ValueError is raised when the filter
    yields no classes.
    """
    filter_ = self.config.classification.filter.as_dict()
    image = os.path.join(
        IMAGE_DIR,
        "Cypripedium/Arietinum/plectrochilum/14990382409.jpg")

    with db.session_scope(META_FILE) as (session, metadata):
        classes = db.get_classes_from_filter(session, metadata, filter_)
        if not classes:
            raise ValueError("No classes found for filter `%s`" % filter_)

        codewords = get_codewords(classes)
        classifier = ImageClassifier(self.config)
        codeword = classifier.classify_image(image, self.ann_file,
                                             self.config)
        classification = get_classification(codewords, codeword, 0.001)

    # Each classification item is an (mse, class) pair; keep the class.
    names = [name for _, name in classification]

    # Single-argument parenthesised form prints the same text on Python 2.
    print("Image is classified as {0}".format(", ".join(names)))
def test_with_hierarchy(self, test_data_dir, ann_dir, max_error=0.001):
    """Test each ANN in a classification hierarchy and store the results.

    For every filter derived from the classification hierarchy, the test
    data file is resolved below `test_data_dir` and the network file below
    `ann_dir`. The expected and the recognized classes of every sample are
    stored per photo ID and level name in ``self.classifications_expected``
    and ``self.classifications``. Filters with a single class are not
    classified; ``['']`` is stored for them instead.

    `max_error` is passed to get_classification(); a level that defines
    ``max_error`` overrides it.

    Returns the result of ``self.get_correct_count()`` (described by the
    original documentation as a 2-tuple ``(correct,total)``).

    Raises ConfigurationError when the classification hierarchy is not set,
    ValueError when a test sample has no label, and RuntimeError when no
    photo ID can be extracted from a label.
    """
    session, metadata = db.get_session_or_error()

    logging.info("Testing the neural networks hierarchy...")

    self.classifications = {}
    self.classifications_expected = {}

    # Get the taxonomic hierarchy from the database.
    self.taxon_hr = db.get_taxon_hierarchy(session, metadata)

    # Get the classification hierarchy from the configurations.
    try:
        self.class_hr = self.config.classification.hierarchy
    except Exception:
        raise ConfigurationError("classification hierarchy not set")

    # Get the name of each level in the classification hierarchy.
    levels = [l.name for l in self.class_hr]

    # Get the prefix for the classification columns.
    try:
        dependent_prefix = self.config.data.dependent_prefix
    except Exception:
        dependent_prefix = OUTPUT_PREFIX

    # Get the expected and recognized classification for each sample in
    # the test data.
    for filter_ in classification_hierarchy_filters(levels, self.taxon_hr):
        logging.info("Classifying on %s", readable_filter(filter_))

        level_name = filter_.get('class')
        level_n = levels.index(level_name)
        level = self.class_hr[level_n]
        test_file = os.path.join(test_data_dir, level.test_file)
        ann_file = os.path.join(ann_dir, level.ann_file)

        # Set the maximum error for classification.
        # NOTE(review): a value set by one level stays in effect for later
        # levels that do not define max_error -- confirm this is intended.
        try:
            max_error = level.max_error
        except Exception:
            pass

        # Replace any placeholders in the paths.
        where = filter_.get('where', {})
        for key, val in where.items():
            val = val if val is not None else '_'
            test_file = test_file.replace("__%s__" % key, val)
            ann_file = ann_file.replace("__%s__" % key, val)

        # Get the class names for this filter.
        classes = db.get_classes_from_filter(session, metadata, filter_)
        assert len(classes) > 0, \
            "No classes found for filter `%s`" % filter_

        # Get the codeword for each class.
        codewords = get_codewords(classes)

        # Load the ANN. No network is needed for a single class, so `ann`
        # stays None in that case and is never run or destroyed.
        single_class = len(classes) <= 1
        ann = None
        if not single_class:
            ann = libfann.neural_net()
            ann.create_from_file(str(ann_file))

        try:
            # Load the test data.
            test_data = TrainData()
            test_data.read_from_file(test_file, dependent_prefix)

            # Test each sample in the test data.
            for label, input_, output in test_data:
                assert len(codewords) == len(output), \
                    "Codeword size mismatch. Codeword has {0} bits, but " \
                    "the training data has {1} output bits.".\
                    format(len(codewords), len(output))

                # Obtain the photo ID from the label.
                if not label:
                    raise ValueError("Test sample is missing a label with "
                                     "photo ID")

                try:
                    photo_id = self.re_photo_id.search(label).group(1)
                    photo_id = int(photo_id)
                except Exception:
                    raise RuntimeError("Failed to obtain the photo ID from "
                                       "the sample label")

                # Create the per-photo result dicts at the top level. This
                # must happen before the single-class shortcut below, which
                # already writes into them.
                if level_n == 0:
                    self.classifications[photo_id] = {}
                    self.classifications_expected[photo_id] = {}

                # Skip classification if there is only one class for this
                # filter.
                if single_class:
                    logging.debug("Not enough classes for filter. Skipping "
                                  "testing of %s", ann_file)
                    self.classifications[photo_id][level_name] = ['']
                    self.classifications_expected[photo_id][level_name] = ['']
                    continue

                # Set the expected class.
                class_expected = get_classification(codewords, output,
                                                    max_error)
                class_expected = [class_ for mse, class_ in class_expected]

                assert len(class_expected) == 1, \
                    "Class codewords must have one positive bit, found {0}".\
                    format(len(class_expected))

                # Get the recognized class.
                codeword = ann.run(input_)
                class_ann = get_classification(codewords, codeword,
                                               max_error)
                class_ann = [class_ for mse, class_ in class_ann]

                # Save the classification at each level.
                self.classifications[photo_id][level_name] = class_ann
                self.classifications_expected[photo_id][level_name] = \
                    class_expected
        finally:
            # Release the network even when a sample fails.
            if ann is not None:
                ann.destroy()

    return self.get_correct_count()
def export_results(self, filename, filter_, error=0.01):
    """Export the classification results to a TSV file.

    Every sample in ``self.test_data`` is run through ``self.ann`` and one
    row ``ID, Class, Classification, Match`` is written to the tab separated
    file `filename`. The class names are obtained from the database using
    the classification filter `filter_`. `error` is passed to
    get_classification() as the maximum error for turning a codeword into
    class names. A sample counts as a match when the first recognized class
    equals the expected class. The last row holds the fraction of correctly
    classified samples, which is 0 when there are no samples.

    Raises RuntimeError when no test data is set and ValueError when the
    codeword length does not match the output length of a sample.
    """
    session, metadata = db.get_session_or_error()

    if self.test_data is None:
        raise RuntimeError("Test data is not set")

    # Get the classification categories from the database.
    classes = db.get_classes_from_filter(session, metadata, filter_)
    assert len(classes) > 0, \
        "No classes found for filter `%s`" % filter_

    # Get the codeword for each class.
    codewords = get_codewords(classes)

    # Write results to file.
    with open(filename, 'w') as fh:
        # Write the header.
        fh.write("%s\n" % "\t".join(
            ['ID', 'Class', 'Classification', 'Match']))

        total = 0
        correct = 0
        for label, input_, output in self.test_data:
            total += 1

            # Samples without a label get an empty ID column.
            row = [label if label else ""]

            if len(codewords) != len(output):
                raise ValueError("Codeword length ({0}) does not "
                    "match output length ({1}). Is the classification "
                    "filter correct?".format(len(codewords), len(output)))

            class_expected = get_classification(codewords, output, error)
            class_expected = [class_ for mse, class_ in class_expected]
            assert len(class_expected) == 1, \
                "The codeword for a class can only have one positive value"
            row.append(class_expected[0])

            codeword = self.ann.run(input_)
            class_ann = get_classification(codewords, codeword, error)
            class_ann = [class_ for mse, class_ in class_ann]
            row.append(", ".join(class_ann))

            # Assume a match if the first items of the classifications
            # match.
            if len(class_ann) > 0 and class_ann[0] == class_expected[0]:
                row.append("+")
                correct += 1
            else:
                row.append("-")

            fh.write("%s\n" % "\t".join(row))

        # Calculate fraction correctly classified. Empty test data would
        # otherwise divide by zero.
        fraction = float(correct) / total if total else 0.0

        # Write correctly classified fraction.
        fh.write("%s\n" % "\t".join(['', '', '', "%.3f" % fraction]))

    # Single-argument parenthesised form prints the same text on Python 2.
    print("Correctly classified: %.1f%%" % (fraction * 100))
    print("Testing results written to %s" % filename)
def export(self, filename, filter_, config=None):
    """Write the training data to `filename`.

    Images to be processed are obtained from the database. Which images are
    obtained and with which classes is set by the filter `filter_`. Image
    fingerprints are obtained from cache, which must have been created for
    configuration `config` or, when `config` is not given, `self.config`.

    The output is a tab separated file with a header row followed by one
    row per photo: the photo ID, the fingerprint values and the codeword of
    the photo's class. When ``self.subset`` is set, only photos whose ID is
    in the subset are exported. The file is only opened once the training
    data has been built and validated, so a failure does not leave a
    truncated file behind.

    Returns None; nothing is written when the filter yields no images.

    Raises FileExistsError when `filename` exists and
    ``conf.force_overwrite`` is not set, and ValueError when no training
    data was produced.
    """
    session, metadata = db.get_session_or_error()

    if not conf.force_overwrite and os.path.isfile(filename):
        raise FileExistsError(filename)

    # Get the classification categories from the database.
    classes = db.get_classes_from_filter(session, metadata, filter_)
    assert len(classes) > 0, \
        "No classes found for filter `%s`" % filter_

    # Get the photos and corresponding classification using the filter.
    images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
    images = images.all()

    if not images:
        logging.info("No images found for the filter `%s`", filter_)
        return

    photo_count_min = self.get_photo_count_min()
    if photo_count_min:
        assert len(images) >= photo_count_min, \
            "Expected to find at least photo_count_min={0} photos, found " \
            "{1}".format(photo_count_min, len(images))

    # Calculate the number of images that will be processed, taking into
    # account the subset.
    if self.subset:
        photo_ids = [photo.id for photo, _ in images]
        n_images = len(np.intersect1d(photo_ids, list(self.subset)))
    else:
        n_images = len(images)

    logging.info("Going to process %d photos...", n_images)

    # Make a codeword for each class.
    codewords = get_codewords(classes)

    # Construct the header.
    header_data, header_out = self.__make_header(len(classes))
    header = ["ID"] + header_data + header_out

    # Get the configurations.
    if not config:
        config = self.config

    # Load the fingerprint cache.
    self.cache.load_cache(self.cache_path, config)

    # Build the training data before touching the output file.
    training_data = TrainData(len(header_data), len(classes))

    for photo, class_ in images:
        # Only export the subset if an export subset is set.
        if self.subset and photo.id not in self.subset:
            continue

        logging.info("Processing `%s` of class `%s`...",
                     photo.path, class_)

        # Get phenotype for this image from the cache.
        phenotype = self.cache.get_phenotype(photo.md5sum)

        assert len(phenotype) == len(header_data), \
            "Fingerprint size mismatch. According to the header " \
            "there are {0} data columns, but the fingerprint has " \
            "{1}".format(len(header_data), len(phenotype))

        training_data.append(phenotype, codewords[class_],
                             label=photo.id)

    training_data.finalize()

    if not training_data:
        raise ValueError("Training data cannot be empty")

    # Round feature data.
    training_data.round_input(6)

    # Everything is validated; now create the file and write it out.
    with open(filename, 'w') as fh:
        # Write the header.
        fh.write("%s\n" % "\t".join(header))

        # Write data rows.
        for photo_id, input_, output in training_data:
            row = [str(photo_id)]
            row.extend(input_.astype(str))
            row.extend(output.astype(str))
            fh.write("%s\n" % "\t".join(row))

    logging.info("Training data written to %s", filename)
def export(self, filename, filter_, config=None, codebook_file=None):
    """Write the training data to `filename`.

    Images to be processed are obtained from the database. Which images are
    obtained and with which classes is set by the filter `filter_`. Image
    fingerprints are obtained from cache, which must have been created for
    configuration `config` or, when `config` is not given, `self.config`.

    When the ``surf`` feature configuration has a true ``bow_clusters``
    attribute, every fingerprint is converted to a BagOfWords code. The
    codebook for that is loaded from `codebook_file` when given, and
    created from the images otherwise.

    The output is a tab separated file with a header row followed by one
    row per photo: the photo ID, the feature values and the codeword of the
    photo's class. When ``self.subset`` is set, only photos whose ID is in
    the subset are exported. The file is only opened once the training data
    has been built and validated, so a failure does not leave a truncated
    file behind.

    Returns None; nothing is written when the filter yields no images.

    Raises FileExistsError when `filename` exists and
    ``conf.force_overwrite`` is not set, and ValueError when no training
    data was produced.
    """
    session, metadata = db.get_session_or_error()

    if not conf.force_overwrite and os.path.isfile(filename):
        raise FileExistsError(filename)

    # Get the classification categories from the database.
    classes = db.get_classes_from_filter(session, metadata, filter_)
    assert len(classes) > 0, \
        "No classes found for filter `%s`" % filter_

    # Get the photos and corresponding classification using the filter.
    images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
    images = images.all()

    if not images:
        logging.info("No images found for the filter `%s`", filter_)
        return

    photo_count_min = self.get_photo_count_min()
    if photo_count_min:
        assert len(images) >= photo_count_min, \
            "Expected to find at least photo_count_min={0} photos, found " \
            "{1}".format(photo_count_min, len(images))

    # Calculate the number of images that will be processed, taking into
    # account the subset.
    if self.subset:
        photo_ids = [photo.id for photo, _ in images]
        n_images = len(np.intersect1d(photo_ids, list(self.subset)))
    else:
        n_images = len(images)

    logging.info("Going to process %d photos...", n_images)

    # Make a codeword for each class.
    codewords = get_codewords(classes)

    # Construct the header.
    header_data, header_out = self.__make_header(len(classes))
    header = ["ID"] + header_data + header_out

    # Get the configurations.
    if not config:
        config = self.config

    # Load the fingerprint cache.
    self.cache.load_cache(self.cache_path, config)

    # Check if the BagOfWords algorithm needs to be applied.
    # NOTE(review): this reads self.config, not the resolved `config`
    # used for the cache above -- confirm that is intended.
    use_bow = getattr(self.config.features['surf'], 'bow_clusters', False)

    codebook = None
    if use_bow:
        if codebook_file is None:
            codebook = self.__make_codebook(images, filename)
        else:
            # NOTE(review): `load` is defined outside this block. If it is
            # pickle-based, only load codebook files from trusted sources.
            with open(codebook_file, "rb") as cb:
                codebook = load(cb)

    # Build the training data before touching the output file.
    training_data = TrainData(len(header_data), len(classes))

    for photo, class_ in images:
        # Only export the subset if an export subset is set.
        if self.subset and photo.id not in self.subset:
            continue

        logging.info("Processing `%s` of class `%s`...",
                     photo.path, class_)

        # Get phenotype for this image from the cache.
        phenotype = self.cache.get_phenotype(photo.md5sum)

        # If the BagOfWords algorithm is applied,
        # convert phenotype to BOW-code.
        if use_bow:
            phenotype = get_bowcode_from_surf_features(phenotype, codebook)

        assert len(phenotype) == len(header_data), \
            "Fingerprint size mismatch. According to the header " \
            "there are {0} data columns, but the fingerprint has " \
            "{1}".format(len(header_data), len(phenotype))

        training_data.append(phenotype, codewords[class_],
                             label=photo.id)

    training_data.finalize()

    if not training_data:
        raise ValueError("Training data cannot be empty")

    # Round feature data only if BOW is not applied.
    if not use_bow:
        training_data.round_input(6)

    # Everything is validated; now create the file and write it out.
    with open(filename, 'w') as fh:
        # Write the header.
        fh.write("%s\n" % "\t".join(header))

        # Write data rows.
        for photo_id, input_, output in training_data:
            row = [str(photo_id)]
            row.extend(input_.astype(str))
            row.extend(output.astype(str))
            fh.write("%s\n" % "\t".join(row))

    logging.info("Training data written to %s", filename)
def test_with_hierarchy(self, test_data_dir, ann_dir, max_error=0.001):
    """Run every network of the classification hierarchy on its test data.

    For each filter derived from the classification hierarchy, the test
    data file below `test_data_dir` and the network file below `ann_dir`
    are resolved. The expected and the recognized classes of every sample
    are stored per photo ID and level name in
    ``self.classifications_expected`` and ``self.classifications``. Filters
    with a single class are not classified; ``['']`` is stored instead.

    `max_error` is handed to get_classification(); a level that defines
    ``max_error`` replaces it.

    Returns the result of ``self.get_correct_count()`` (described by the
    original documentation as a 2-tuple ``(correct,total)``).
    """
    session, metadata = db.get_session_or_error()
    logging.info("Testing the neural networks hierarchy...")

    self.classifications = {}
    self.classifications_expected = {}

    # Taxonomic hierarchy as stored in the database.
    self.taxon_hr = db.get_taxon_hierarchy(session, metadata)

    # Classification hierarchy as set in the configurations.
    try:
        self.class_hr = self.config.classification.hierarchy
    except:
        raise ConfigurationError("classification hierarchy not set")

    level_names = [entry.name for entry in self.class_hr]

    # Prefix of the classification columns in the test data.
    try:
        dependent_prefix = self.config.data.dependent_prefix
    except:
        dependent_prefix = OUTPUT_PREFIX

    for filter_ in classification_hierarchy_filters(level_names,
                                                    self.taxon_hr):
        logging.info("Classifying on %s" % readable_filter(filter_))

        level_name = filter_.get('class')
        level_n = level_names.index(level_name)
        level = self.class_hr[level_n]
        test_file = os.path.join(test_data_dir, level.test_file)
        ann_file = os.path.join(ann_dir, level.ann_file)

        # A level may bring its own maximum error.
        # NOTE(review): the value carries over to later levels that do not
        # define max_error -- confirm this is intended.
        try:
            max_error = level.max_error
        except:
            pass

        # Fill in the path placeholders; a missing value becomes '_'.
        for key, val in filter_.get('where', {}).items():
            placeholder = "__%s__" % key
            substitute = '_' if val is None else val
            test_file = test_file.replace(placeholder, substitute)
            ann_file = ann_file.replace(placeholder, substitute)

        # Class names and their codewords for this filter.
        classes = db.get_classes_from_filter(session, metadata, filter_)
        assert len(classes) > 0, \
            "No classes found for filter `%s`" % filter_
        codewords = get_codewords(classes)
        multi_class = len(classes) > 1

        # The network object always exists; it is only loaded from file
        # when there is something to classify.
        ann = libfann.neural_net()
        if multi_class:
            ann.create_from_file(str(ann_file))

        test_data = TrainData()
        test_data.read_from_file(test_file, dependent_prefix)

        for label, input_, output in test_data:
            assert len(codewords) == len(output), \
                "Codeword size mismatch. Codeword has {0} bits, but the " \
                "training data has {1} output bits.".\
                format(len(codewords), len(output))

            # The photo ID is taken from the sample label.
            if not label:
                raise ValueError("Test sample is missing a label with "
                                 "photo ID")

            try:
                photo_id = int(self.re_photo_id.search(label).group(1))
            except:
                raise RuntimeError("Failed to obtain the photo ID from "
                                   "the sample label")

            # The per-photo result dicts are created at the top level.
            if level_n == 0:
                self.classifications[photo_id] = {}
                self.classifications_expected[photo_id] = {}

            # With a single class there is nothing to classify.
            if not multi_class:
                logging.debug("Not enough classes for filter. Skipping "
                              "testing of %s" % ann_file)
                self.classifications[photo_id][level_name] = ['']
                self.classifications_expected[photo_id][level_name] = ['']
                continue

            # Expected class, decoded from the sample's output codeword.
            expected = [
                name for _, name
                in get_classification(codewords, output, max_error)
            ]
            assert len(expected) == 1, \
                "Class codewords must have one positive bit, found {0}".\
                format(len(expected))

            # Recognized class, decoded from the network's output.
            recognized = [
                name for _, name
                in get_classification(codewords, ann.run(input_), max_error)
            ]

            self.classifications[photo_id][level_name] = recognized
            self.classifications_expected[photo_id][level_name] = expected

        ann.destroy()

    return self.get_correct_count()