def predictionPooling(p):
    """Pool a 2D matrix of per-spec predictions into one score vector.

    Args:
        p: np.ndarray of shape (num_specs, num_classes) with one row of
           class predictions per spectrogram, or an already pooled 1D
           array (returned unchanged).

    Returns:
        1D np.ndarray of pooled class scores, scaled to [0, 1] when the
        pooled maximum exceeds 1.0.
    """
    # You can test different prediction pooling strategies here
    if p.ndim == 2:
        try:
            # Work on a copy so the caller's array is not mutated in place
            # (the original implementation zeroed entries of the input).
            p = np.array(p, dtype='float32')

            # Median filtered pooling for monophonic recordings:
            # suppress everything below 1.5x the per-spec median...
            row_median = np.median(p, axis=1, keepdims=True)
            p[p < row_median * 1.5] = 0.0

            # ...then mean exponential pooling across specs, shifted and
            # rescaled into [0, 1] if needed.
            p_pool = np.mean((p * 2) ** 2, axis=0)
            p_pool -= p_pool.min()
            if p_pool.max() > 1.0:
                p_pool /= p_pool.max()

            # Alternative strategies kept for experimentation:
            # Mean exponential pooling for monophonic recordings
            #p_pool = np.mean((p * 2) ** 2, axis=0)
            #p_pool[p_pool > 1.0] = 1.0

            # Simple average pooling
            #p_pool = np.mean(p, axis=0)
            #p_pool = sigmoid(p_pool)
        except Exception:
            # Best-effort fallback: random scores instead of crashing the
            # evaluation run. (Was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.)
            p_pool = cfg.getRandomState().normal(0.0, 1.0, (p.shape[1]))
    else:
        p_pool = p

    return p_pool
def build_pi_model():
    """Assemble the lightweight CNN used for Raspberry Pi inference.

    Strided 3x3 convolution groups followed by two dense+dropout blocks
    and a softmax classifier; all sizes come from the global cfg module.

    Returns:
        The final lasagne output layer of the network.
    """
    log.i('BUILDING RASBPERRY PI MODEL...')

    # Seed lasagne's RNG for reproducible weight initialization
    lasagne_random.set_rng(cfg.getRandomState())

    # Image input
    net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0]))

    # Strided convolution groups (stride=2 downsamples, no pooling layers)
    for group_idx, group_filters in enumerate(cfg.FILTERS):
        net = batch_norm(
            l.Conv2DLayer(net,
                          num_filters=group_filters,
                          filter_size=cfg.KERNEL_SIZES[group_idx],
                          num_groups=cfg.NUM_OF_GROUPS[group_idx],
                          pad='same',
                          stride=2,
                          W=initialization(cfg.NONLINEARITY),
                          nonlinearity=nonlinearity(cfg.NONLINEARITY)))
        log.i(('\tGROUP', group_idx + 1, 'OUT SHAPE:', l.get_output_shape(net)))

    # Two identical dense blocks, each followed by dropout
    for _ in range(2):
        net = l.DenseLayer(net,
                           cfg.DENSE_UNITS,
                           nonlinearity=nonlinearity(cfg.NONLINEARITY),
                           W=initialization(cfg.NONLINEARITY))
        net = l.DropoutLayer(net, p=cfg.DROPOUT)

    # Softmax classification head
    net = l.DenseLayer(net,
                       len(cfg.CLASSES),
                       nonlinearity=nonlinearity('softmax'),
                       W=initialization('softmax'))

    log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net)))
    log.i("...DONE!")

    # Model stats: layers carrying weights, and total parameter count
    weighted_layers = sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))
    log.i(("MODEL HAS", weighted_layers, "WEIGHTED LAYERS"))
    log.i(("MODEL HAS", l.count_params(net), "PARAMS"))

    return net
def getSpecBatches(split):
    """Generator that yields one spectrogram batch per test file.

    Args:
        split: iterable of (filepath, label) tuples.

    Yields:
        (spec_batch, label, filename) — spec_batch is an np.ndarray with
        at most cfg.MAX_SPECS_PER_FILE prepared, shuffled spectrograms.
    """

    # Random Seed
    random = cfg.getRandomState()

    # Make predictions for every testfile
    for t in split:

        # Spec batch
        spec_batch = []

        # Get specs for file
        for spec in audio.specsFromFile(t[0],
                                        cfg.SAMPLE_RATE,
                                        cfg.SPEC_LENGTH,
                                        cfg.SPEC_OVERLAP,
                                        cfg.SPEC_MINLEN,
                                        shape=(cfg.IM_SIZE[1], cfg.IM_SIZE[0]),
                                        fmin=cfg.SPEC_FMIN,
                                        fmax=cfg.SPEC_FMAX,
                                        spec_type=cfg.SPEC_TYPE):

            # Resize spec
            spec = image.resize(spec, cfg.IM_SIZE[0], cfg.IM_SIZE[1], mode=cfg.RESIZE_MODE)

            # Normalize spec
            spec = image.normalize(spec, cfg.ZERO_CENTERED_NORMALIZATION)

            # Prepare as input
            spec = image.prepare(spec)

            # Add to batch (the first spec becomes the batch array,
            # later specs are stacked onto it)
            if len(spec_batch) > 0:
                spec_batch = np.vstack((spec_batch, spec))
            else:
                spec_batch = spec

            # Batch too large?
            if spec_batch.shape[0] >= cfg.MAX_SPECS_PER_FILE:
                break

        # No specs? Fall back to a random-noise spec so every file
        # still yields a batch.
        if len(spec_batch) == 0:
            spec = random.normal(0.0, 1.0, (cfg.IM_SIZE[1], cfg.IM_SIZE[0]))
            spec_batch = image.prepare(spec)

        # Shuffle spec batch
        spec_batch = shuffle(spec_batch, random_state=random)

        # yield batch, labels and filename
        yield spec_batch[:cfg.MAX_SPECS_PER_FILE], t[1], t[0].split(os.sep)[-1]
def sortDataset(mdata): print 'PARSING CLASSES...' # Parse classes for c in mdata: print '\t', c # Determine size of val split (10% but at least 1 file) val = max(1, len(mdata[c]) * 0.1) # Shuffle list of files mdata[c] = shuffle(mdata[c], random_state=cfg.getRandomState()) # Parse list of files and copy to destination for f in mdata[c]: # Get class name (we use the sci-name which makes it easier to evaluate with background species) # The submission format uses class id only - so we have to figure that out later cname = f['sci-name'] # Make folders m_path = os.path.join(cfg.TRAINSET_PATH, 'metadata') if not os.path.exists(m_path): os.makedirs(m_path) t_path = os.path.join(cfg.TRAINSET_PATH, 'train', cname) if not os.path.exists(t_path): os.makedirs(t_path) v_path = os.path.join(cfg.TRAINSET_PATH, 'val', cname) if not os.path.exists(v_path): os.makedirs(v_path) # Copy files with open( os.path.join(m_path, f['filename'].rsplit('.')[0] + '.json'), 'w') as mfile: json.dump(f, mfile) if mdata[c].index(f) < val: copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(v_path, f['filename'])) else: copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(t_path, f['filename'])) print '...DONE!'
def parseTestSet():
    """Collect test files and their ground-truth species from metadata.

    Walks cfg.TESTSET_PATH keeping files whose folder matches a trained
    class, limits and shuffles the selection, then loads each file's
    JSON metadata for the foreground (and optionally background) species.

    Returns:
        List of (filepath, species_list) tuples.
    """

    # Random Seed
    random = cfg.getRandomState()

    # Status
    log.i('PARSING TEST SET...', new_line=False)

    TEST = []

    # List of test files
    fnames = []
    for path, dirs, files in os.walk(cfg.TESTSET_PATH):
        # Only folders named after a trained class are considered
        if path.split(os.sep)[-1] in cfg.CLASSES:
            scnt = 0
            for f in files:
                fnames.append(os.path.join(path, f))
                scnt += 1
                # Per-class cap (a value <= 0 disables the cap)
                if scnt >= cfg.MAX_TEST_SAMPLES_PER_CLASS and cfg.MAX_TEST_SAMPLES_PER_CLASS > 0:
                    break
    # Shuffle before capping the total file count, then sort for a
    # deterministic processing order
    fnames = sorted(shuffle(fnames, random_state=random)[:cfg.MAX_TEST_FILES])

    # Get ground truth from metadata
    for f in fnames:

        # Metadata path (the JSON shares the audio file's base name,
        # truncated at the first dot)
        m_path = os.path.join(cfg.METADATA_PATH, f.split(os.sep)[-1].split('.')[0] + '.json')

        # Load JSON
        with open(m_path) as jfile:
            data = json.load(jfile)

        # Get Species (+ background species)
        # Only species present in the trained classes are relevant for the metric
        # Still, we are adding anything we have right now and sort it out later
        if cfg.TEST_WITH_BG_SPECIES:
            bg = data['background']
        else:
            bg = []
        species = [data['sci-name']] + bg

        # Add data to test set
        TEST.append((f, species))

    # Status
    log.i('DONE!')
    log.i(('TEST FILES:', len(TEST)))

    return TEST
def parseDataset():
    """Collect sample paths and labels from the dataset folder.

    Uses subfolder names as class labels, optionally shuffles and limits
    the class list, oversamples under-represented classes up to
    cfg.MIN_SAMPLES_PER_CLASS, and splits everything into train/val.

    Returns:
        (classes, train, val): the class name list plus two lists of
        (path, label) tuples.
    """

    # Random Seed
    random = cfg.getRandomState()

    # We use subfolders as class labels
    classes = [folder for folder in sorted(os.listdir(cfg.DATASET_PATH))
               if folder in cfg.CLASS_WHITELIST or len(cfg.CLASS_WHITELIST) == 0]
    if not cfg.SORT_CLASSES_ALPHABETICALLY:
        classes = shuffle(classes, random_state=random)
    classes = classes[:cfg.MAX_CLASSES]

    # Now we enlist all image paths for each class
    images = []
    sample_count = {}
    for c in classes:
        c_images = [os.path.join(cfg.DATASET_PATH, c, path)
                    for path in shuffle(os.listdir(os.path.join(cfg.DATASET_PATH, c)), random_state=random)
                    if isValidClass(c, path)][:cfg.MAX_SAMPLES_PER_CLASS]
        sample_count[c] = len(c_images)
        images += c_images

        # Do we want to correct class imbalance?
        # This will affect validation scores as we use some samples in TRAIN and VAL.
        # Guard against empty classes: randint on an empty range raises,
        # which previously crashed (or would loop forever) here.
        while len(c_images) > 0 and sample_count[c] < cfg.MIN_SAMPLES_PER_CLASS:
            # numpy RandomState.randint: the high bound is exclusive
            images += [c_images[random.randint(0, len(c_images))]]
            sample_count[c] += 1

    # Add labels to image paths (label = parent folder name)
    images = [(path, path.split(os.sep)[-2]) for path in images]

    # Shuffle image paths
    images = shuffle(images, random_state=random)

    # Validation split.
    # images[:-vsplit] would be EMPTY when vsplit == 0, so compute an
    # explicit split point: a zero-sized split leaves everything in TRAIN.
    vsplit = int(len(images) * cfg.VAL_SPLIT)
    split_at = len(images) - vsplit
    train = images[:split_at]
    val = images[split_at:]

    # Show some stats
    log.i(("CLASSES:", len(classes)))
    log.i(("CLASS LABELS:", sorted(sample_count.items(), key=operator.itemgetter(1))))
    log.i(("TRAINING IMAGES:", len(train)))
    log.i(("VALIDATION IMAGES:", len(val)))

    return classes, train, val
# Author: Stefan Kahl, 2018, Chemnitz University of Technology import os import time import numpy as np import cv2 from sklearn.utils import shuffle import config as cfg from utils import audio from utils import log ######################## CONFIG ######################### RANDOM = cfg.getRandomState() ######################### SPEC ########################## def getSpecs(path): specs = [] noise = [] # Get mel-specs for file for spec in audio.specsFromFile(path, rate=cfg.SAMPLE_RATE, seconds=cfg.SPEC_LENGTH, overlap=cfg.SPEC_OVERLAP, minlen=cfg.SPEC_MINLEN, fmin=cfg.SPEC_FMIN,
def build_baseline_model(): log.i('BUILDING BASELINE MODEL...') # Random Seed lasagne_random.set_rng(cfg.getRandomState()) # Input layer for images net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0])) # Stride size (as an alternative to max pooling) if cfg.MAX_POOLING: s = 1 else: s = 2 # Convolutinal layer groups for i in range(len(cfg.FILTERS)): # 3x3 Convolution + Stride net = batch_norm( l.Conv2DLayer(net, num_filters=cfg.FILTERS[i], filter_size=cfg.KERNEL_SIZES[i], num_groups=cfg.NUM_OF_GROUPS[i], pad='same', stride=s, W=initialization(cfg.NONLINEARITY), nonlinearity=nonlinearity(cfg.NONLINEARITY))) # Pooling layer if cfg.MAX_POOLING: net = l.MaxPool2DLayer(net, pool_size=2) # Dropout Layer (we support different types of dropout) if cfg.DROPOUT_TYPE == 'channels' and cfg.DROPOUT > 0.0: net = l.dropout_channels(net, p=cfg.DROPOUT) elif cfg.DROPOUT_TYPE == 'location' and cfg.DROPOUT > 0.0: net = l.dropout_location(net, p=cfg.DROPOUT) elif cfg.DROPOUT > 0.0: net = l.DropoutLayer(net, p=cfg.DROPOUT) log.i(('\tGROUP', i + 1, 'OUT SHAPE:', l.get_output_shape(net))) # Final 1x1 Convolution net = batch_norm( l.Conv2DLayer(net, num_filters=cfg.FILTERS[i] * 2, filter_size=1, W=initialization('identity'), nonlinearity=nonlinearity('identity'))) log.i(('\tFINAL CONV OUT SHAPE:', l.get_output_shape(net))) # Global Pooling layer (default mode = average) net = l.GlobalPoolLayer(net) log.i(("\tFINAL POOLING SHAPE:", l.get_output_shape(net))) # Classification Layer (Softmax) net = l.DenseLayer(net, len(cfg.CLASSES), nonlinearity=nonlinearity('softmax'), W=initialization('softmax')) log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net))) log.i("...DONE!") # Model stats log.i(("MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS")) log.i(("MODEL HAS", l.count_params(net), "PARAMS")) return net
def build_resnet_model(): log.i('BUILDING RESNET MODEL...') # Random Seed lasagne_random.set_rng(cfg.getRandomState()) # Input layer for images net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0])) # First Convolution net = l.Conv2DLayer(net, num_filters=cfg.FILTERS[0], filter_size=cfg.KERNEL_SIZES[0], pad='same', W=initialization(cfg.NONLINEARITY), nonlinearity=None) log.i(("\tFIRST CONV OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Residual Stacks for i in range(0, len(cfg.FILTERS)): net = resblock(net, filters=cfg.FILTERS[i] * cfg.RESNET_K, kernel_size=cfg.KERNEL_SIZES[i], stride=2, num_groups=cfg.NUM_OF_GROUPS[i]) for _ in range(1, cfg.RESNET_N): net = resblock(net, filters=cfg.FILTERS[i] * cfg.RESNET_K, kernel_size=cfg.KERNEL_SIZES[i], num_groups=cfg.NUM_OF_GROUPS[i], preactivated=False) log.i(("\tRES STACK", i + 1, "OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Post Activation net = batch_norm(net) net = l.NonlinearityLayer(net, nonlinearity=nonlinearity(cfg.NONLINEARITY)) # Pooling net = l.GlobalPoolLayer(net) log.i(("\tFINAL POOLING SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Classification Layer net = l.DenseLayer(net, len(cfg.CLASSES), nonlinearity=nonlinearity('identity'), W=initialization('identity')) net = l.NonlinearityLayer(net, nonlinearity=nonlinearity('softmax')) log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)))) log.i("...DONE!") # Model stats log.i(("MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS")) log.i(("MODEL HAS", l.count_params(net), "PARAMS")) return net
def getSpecBatches(split):
    """Generator yielding timestamped prediction batches per test file.

    Collects specs from each file and emits a batch every time at least
    cfg.SPECS_PER_PREDICTION specs have been gathered, together with the
    timestamp interval the batch covers.

    Args:
        split: iterable of (filepath, label) tuples.

    Yields:
        (spec_batch, label, timestamp, filename)
    """

    # Random Seed
    # NOTE(review): 'random' is unused in this variant (no noise fallback,
    # no shuffling) — kept for parity with the other batch loader.
    random = cfg.getRandomState()

    # Make predictions for every testfile
    for t in split:

        # Spec batch
        spec_batch = []

        # Keep track of timestamps
        pred_start = 0

        # Get specs for file
        for spec in audio.specsFromFile(t[0],
                                        cfg.SAMPLE_RATE,
                                        cfg.SPEC_LENGTH,
                                        cfg.SPEC_OVERLAP,
                                        cfg.SPEC_MINLEN,
                                        shape=(cfg.IM_SIZE[1], cfg.IM_SIZE[0]),
                                        fmin=cfg.SPEC_FMIN,
                                        fmax=cfg.SPEC_FMAX):

            # Resize spec
            spec = image.resize(spec, cfg.IM_SIZE[0], cfg.IM_SIZE[1], mode=cfg.RESIZE_MODE)

            # Normalize spec
            spec = image.normalize(spec, cfg.ZERO_CENTERED_NORMALIZATION)

            # Prepare as input
            spec = image.prepare(spec)

            # Add to batch (first spec becomes the batch, later ones stacked)
            if len(spec_batch) > 0:
                spec_batch = np.vstack((spec_batch, spec))
            else:
                spec_batch = spec

            # Batch too large?
            if spec_batch.shape[0] >= cfg.MAX_SPECS_PER_FILE:
                break

            # Do we have enough specs for a prediction?
            if len(spec_batch) >= cfg.SPECS_PER_PREDICTION:

                # Calculate next timestamp: batch spans its specs, which
                # overlap by cfg.SPEC_OVERLAP seconds each
                pred_end = pred_start + cfg.SPEC_LENGTH + (
                    (len(spec_batch) - 1) * (cfg.SPEC_LENGTH - cfg.SPEC_OVERLAP))

                # Store prediction
                ts = getTimestamp(int(pred_start), int(pred_end))

                # Advance to next timestamp
                pred_start = pred_end - cfg.SPEC_OVERLAP

                yield spec_batch, t[1], ts, t[0].split(os.sep)[-1]

                # Spec batch
                spec_batch = []

        # NOTE(review): any specs left over after the loop (fewer than
        # SPECS_PER_PREDICTION) are silently discarded — confirm intended.
def resetRandomState():
    """Re-seed the module-level RANDOM generator from the current config.

    Call after changing cfg's seed settings to make subsequent draws
    from RANDOM reproducible again.
    """
    global RANDOM
    RANDOM = cfg.getRandomState()
def train(NET, TRAIN, VAL):
    """Run the full training loop for NET.

    Optionally warm-starts from a pretrained snapshot and/or distills
    from teacher models, then alternates TRAIN/VAL passes per epoch,
    tracking losses, accuracy and MLRAP; keeps the best net by MLRAP,
    supports snapshots, early stopping and Ctrl+C interruption.

    Args:
        NET:   lasagne network (output layer) to train.
        TRAIN: list of training samples, shuffled every epoch.
        VAL:   list of validation samples.

    Returns:
        Whatever io.saveModel returns for the best net found.
    """

    # Random Seed
    random = cfg.getRandomState()
    image.resetRandomState()

    # Load pretrained model
    if cfg.PRETRAINED_MODEL_NAME:
        snapshot = io.loadModel(cfg.PRETRAINED_MODEL_NAME)
        NET = io.loadParams(NET, snapshot['params'])

    # Load teacher models (their averaged outputs replace the targets)
    teach_funcs = []
    if len(cfg.TEACHER) > 0:
        for t in cfg.TEACHER:
            snapshot = io.loadModel(t)
            TEACHER = snapshot['net']
            teach_funcs.append(birdnet.test_function(TEACHER, hasTargets=False))

    # Compile Theano functions
    train_net = birdnet.train_function(NET)
    test_net = birdnet.test_function(NET)

    # Status
    log.i("START TRAINING...")

    # Train for some epochs...
    for epoch in range(cfg.EPOCH_START, cfg.EPOCHS + 1):

        try:

            # Stop?
            if cfg.DO_BREAK:
                break

            # Clear stats for every epoch
            stats.clearStats()
            stats.setValue('sample_count', len(TRAIN) + len(VAL))

            # Start timer
            stats.tic('epoch_time')

            # Shuffle dataset (this way we get "new" batches every epoch)
            TRAIN = shuffle(TRAIN, random_state=random)

            # Iterate over TRAIN batches of images
            for image_batch, target_batch in bg.nextBatch(TRAIN):

                # Show progress
                stats.showProgress(epoch)

                # If we have a teacher, we use that model to get new targets
                # (soft labels averaged over all teachers)
                if len(teach_funcs) > 0:
                    target_batch = np.zeros((len(teach_funcs), target_batch.shape[0], target_batch.shape[1]), dtype='float32')
                    for i in range(len(teach_funcs)):
                        target_batch[i] = teach_funcs[i](image_batch)
                    target_batch = np.mean(target_batch, axis=0)

                # Calling the training functions returns the current loss
                loss = train_net(image_batch, target_batch, lr.dynamicLearningRate(cfg.LR_SCHEDULE, epoch))
                stats.setValue('train loss', loss, 'append')
                stats.setValue('batch_count', 1, 'add')

                # Stop?
                if cfg.DO_BREAK:
                    break

            # Iterate over VAL batches of images
            for image_batch, target_batch in bg.nextBatch(VAL, False, True):

                # Calling the test function returns the net output, loss and accuracy
                prediction_batch, loss, acc = test_net(image_batch, target_batch)
                stats.setValue('val loss', loss, 'append')
                stats.setValue('val acc', acc, 'append')
                stats.setValue('batch_count', 1, 'add')
                stats.setValue('lrap', [metrics.lrap(prediction_batch, target_batch)], 'add')

                # Show progress
                stats.showProgress(epoch)

                # Stop?
                if cfg.DO_BREAK:
                    break

            # Show stats for epoch
            stats.showProgress(epoch, done=True)
            stats.toc('epoch_time')
            log.r(('TRAIN LOSS:', np.mean(stats.getValue('train loss'))), new_line=False)
            log.r(('VAL LOSS:', np.mean(stats.getValue('val loss'))), new_line=False)
            log.r(('VAL ACC:', int(np.mean(stats.getValue('val acc')) * 10000) / 100.0, '%'), new_line=False)
            log.r(('MLRAP:', int(np.mean(stats.getValue('lrap')) * 1000) / 1000.0), new_line=False)
            log.r(('TIME:', stats.getValue('epoch_time'), 's'))

            # Save snapshot?
            if not epoch % cfg.SNAPSHOT_EPOCHS:
                io.saveModel(NET, cfg.CLASSES, epoch)
                # NOTE(review): leftover debug print — consider removing
                print('vish')
                io.saveParams(NET, cfg.CLASSES, epoch)

            # New best net? (selected by mean label-ranking avg precision)
            if np.mean(stats.getValue('lrap')) > stats.getValue('best_mlrap'):
                stats.setValue('best_net', NET, static=True)
                stats.setValue('best_epoch', epoch, static=True)
                stats.setValue('best_mlrap', np.mean(stats.getValue('lrap')), static=True)

            # Early stopping?
            if epoch - stats.getValue('best_epoch') >= cfg.EARLY_STOPPING_WAIT:
                log.i('EARLY STOPPING!')
                break

            # Stop?
            if cfg.DO_BREAK:
                break

        except KeyboardInterrupt:
            log.i('KeyboardInterrupt')
            cfg.DO_BREAK = True
            break

    # Status
    log.i('TRAINING DONE!')
    log.r(('BEST MLRAP:', stats.getValue('best_mlrap'), 'EPOCH:', stats.getValue('best_epoch')))

    # Save best model and return
    io.saveParams(stats.getValue('best_net'), cfg.CLASSES, stats.getValue('best_epoch'))
    # NOTE(review): leftover debug print — consider removing
    print('in training vish')
    return io.saveModel(stats.getValue('best_net'), cfg.CLASSES, stats.getValue('best_epoch'))