def findNoisesegments(self, dirName):
    """
    Collect auto-detected segments that overlap no manual annotation.

    Ground-truth files are first exported for every non-empty .wav under
    dirName that has a sibling .data annotation file. Wavelet detection is
    then run over the directory, and every detected segment for which
    self.Overlap() is false against all annotated segments of self.species
    is kept as noise.

    :param dirName: directory holding .wav files and their .data annotations
    :returns: noise segments [[filename, seg, label], ...] with
        label = len(self.calltypes); None when no annotated segment of the
        species was found
    """
    print('Generating GT...')
    annotatedTotal = 0
    for folder, _subdirs, names in os.walk(dirName):
        for name in names:
            wavPath = os.path.join(folder, name)
            if not name.lower().endswith('.wav'):
                continue
            if os.stat(wavPath).st_size == 0 or name + '.data' not in names:
                continue
            annots = Segment.SegmentList()
            annots.parseJSON(wavPath + '.data')
            annotatedTotal += len(annots.getSpecies(self.species))
            # Call types are ignored here: all calls of the target species
            # go into the exported ground truth.
            annots.exportGT(wavPath, self.species, resolution=1.0)

    if annotatedTotal == 0:
        print("ERROR: no segments for species %s found" % self.species)
        return

    detector = WaveletSegment.WaveletSegment(self.filter, 'dmey2')
    # detected has the form [(filename, [segments]), ...]
    detected = detector.waveletSegment_cnn(dirName, self.filter)
    print("autoSeg", detected)

    found = []
    for entry in detected:
        print(entry[0])
        wavPath = entry[0]
        if os.stat(wavPath).st_size == 0:
            continue

        # Annotated segments of the target species (none if no .data file).
        truth = []
        if os.path.isfile(wavPath + '.data'):
            annots = Segment.SegmentList()
            annots.parseJSON(wavPath + '.data')
            truth = [annots[ix] for ix in annots.getSpecies(self.species)]

        for candidate in entry[1]:
            if any(self.Overlap(known, candidate) for known in truth):
                continue
            found.append([wavPath, candidate, len(self.calltypes)])
    return found
def findCTsegments(self, dirName, calltypei):
    """
    Find fully certain segments of one call type in annotated recordings.

    Walks dirName and, for every .wav with a sibling .data file, selects the
    segments of self.species (when the recogniser has a single call type) or
    of call type self.calltypes[calltypei] (otherwise). A segment is kept
    only when every label of the target species has certainty 100; labels of
    other species do not affect the decision.

    Segments with an empty label list are skipped. Previously such a segment
    reused the certainty computed for the preceding segment, or raised
    NameError if it was the first one.

    :param dirName: directory with reviewed/manual .data annotations
    :param calltypei: index into self.calltypes, also used as the label
    :returns: ct segments [[filename, seg[:2], calltypei], ...]
    """
    calltypeSegments = []
    for root, dirs, files in os.walk(dirName):
        for file in files:
            if not (file.lower().endswith('.wav') and file + '.data' in files):
                continue
            wavFile = os.path.join(root, file)
            segments = Segment.SegmentList()
            segments.parseJSON(wavFile + '.data')
            if len(self.calltypes) == 1:
                ctSegments = segments.getSpecies(self.species)
            else:
                ctSegments = segments.getCalltype(
                    self.species, self.calltypes[calltypei])
            for indx in ctSegments:
                seg = segments[indx]
                # Other species count as 100 so that only the target
                # species' certainty can pull the minimum below 100.
                cert = [
                    lab["certainty"] if lab["species"] == self.species else 100
                    for lab in seg[4]
                ]
                # skip unlabelled and uncertain segments
                if cert and min(cert) == 100:
                    calltypeSegments.append([wavFile, seg[:2], calltypei])
    return calltypeSegments
def findAllsegments(self, dirName):
    """
    Export ground truth and build one-second noise segments per recording.

    For every non-empty .wav under dirName that has a sibling .data file,
    the ground truth for self.species is exported, then the segments
    [sec, sec + 1] for sec in range(floor(Duration) - 1) are generated,
    passed through Segmenter.joinGaps(maxgap=0) and all returned as noise.

    NOTE(review): the original description says noise is the complement of
    the annotated segments, but nothing in this function removes annotated
    seconds from the generated list - confirm whether that happens elsewhere.

    :param dirName: directory with manually annotated .data files
    :returns: noise segments [[filename, seg, label], ...] with
        label = len(self.calltypes); None when no annotated segment of the
        species was found
    """
    joiner = Segment.Segmenter()
    annotatedTotal = 0
    collected = []
    print('Generating GT...')
    for folder, _subdirs, names in os.walk(dirName):
        for name in names:
            wavPath = os.path.join(folder, name)
            usable = (name.lower().endswith('.wav')
                      and os.stat(wavPath).st_size != 0
                      and name + '.data' in names)
            if not usable:
                continue

            # Generate the GT file from the annotations; call types are
            # ignored and all calls of the target species are used.
            annots = Segment.SegmentList()
            annots.parseJSON(wavPath + '.data')
            annotatedTotal += len(annots.getSpecies(self.species))
            annots.exportGT(wavPath, self.species, resolution=1.0)

            print('Determining noise...')
            seconds = Segment.SegmentList()
            nSeconds = math.floor(annots.metadata["Duration"]) - 1
            for begin in range(nSeconds):
                seconds.addSegment([begin, begin + 1, 0, 0, []])
            merged = joiner.joinGaps(seconds, maxgap=0)
            print("autoSeg, file", wavPath, merged)
            collected.extend(
                [wavPath, piece, len(self.calltypes)] for piece in merged)

    if annotatedTotal == 0:
        print("ERROR: no segments for species %s found" % self.species)
        return
    return collected
def findCTsegments(self, datafile, calltypei):
    """
    Return the leading two fields of every segment of one call type.

    The annotation file is parsed and segments are selected by species only
    (when the recogniser has a single call type) or by species and
    self.calltypes[calltypei] (otherwise).

    :param datafile: path of the annotation (.data) file to parse
    :param calltypei: index into self.calltypes
    :returns: list of seg[:2] slices (presumably [start, end] - confirm
        against Segment.SegmentList)
    """
    targetSpecies = self.currfilt["species"]
    annots = Segment.SegmentList()
    annots.parseJSON(datafile)

    if len(self.calltypes) == 1:
        chosen = annots.getSpecies(targetSpecies)
    else:
        chosen = annots.getCalltype(targetSpecies, self.calltypes[calltypei])

    bounds = []
    for ix in chosen:
        bounds.append(annots[ix][:2])
    return bounds
def findCTsegments(self, file, calltypei):
    """
    Return the leading two fields of call-type segments from a .tmpdata file.

    Nothing is read unless `file` ends in .wav (case-insensitive) and
    `file + '.tmpdata'` exists; in that case segments are selected by species
    only (single call type) or by species and self.calltypes[calltypei].

    :param file: path of the .wav file whose .tmpdata should be read
    :param calltypei: index into self.calltypes
    :returns: list of seg[:2] slices; empty when the file does not qualify
    """
    if not file.lower().endswith('.wav'):
        return []
    if not os.path.isfile(file + '.tmpdata'):
        return []

    annots = Segment.SegmentList()
    annots.parseJSON(file + '.tmpdata')
    if len(self.calltypes) == 1:
        chosen = annots.getSpecies(self.species)
    else:
        chosen = annots.getCalltype(self.species, self.calltypes[calltypei])
    return [annots[ix][:2] for ix in chosen]
def __init__(self, testDir, currfilt, filtname, configdir, filterdir, CLI=False):
    """
    Test a recogniser on an annotated folder and write test-results.txt.

    Steps: export ground truth from the .data annotations, run batch
    processing in "test" mode, then summarise detection without and (if the
    recogniser has a "CNN" entry) with the CNN. The summary is stored in
    self.text; self.text is 0 when no annotated segments of the species
    exist in testDir.

    The results file is now closed on every exit path, including the early
    "no segments" return and any exception raised while testing.

    :param testDir: folder with .wav files and .data annotations; the
        results file is written here
    :param currfilt: the recogniser to be used (dict with 'species',
        'SampleRate', 'Filters' and optionally 'CNN')
    :param filtname: recogniser name, passed to batch processing and used to
        look up the CNN
    :param configdir: config directory handed to batch processing
    :param filterdir: directory searched for filters and CNN models
    :param CLI: accepted for interface compatibility; not used here
    """
    self.testDir = testDir
    self.outfile = open(os.path.join(self.testDir, "test-results.txt"), "w")
    try:
        self.currfilt = currfilt
        self.filtname = filtname
        self.configdir = configdir
        self.filterdir = filterdir
        # Note: this is just the species name, unlike the self.species in Batch mode
        species = self.currfilt['species']
        self.sampleRate = self.currfilt['SampleRate']
        self.calltypes = [fi['calltype'] for fi in self.currfilt['Filters']]

        self.outfile.write("Recogniser name: %s\n" % (filtname))
        self.outfile.write("Species name: %s\n" % (species))
        self.outfile.write("Using data: %s\n" % (self.testDir))

        # 0. Generate GT files from annotations in test folder
        self.manSegNum = 0
        self.window = 1
        print('Generating GT...')
        for root, dirs, files in os.walk(self.testDir):
            for file in files:
                wavFile = os.path.join(root, file)
                if file.lower().endswith('.wav') and os.stat(
                        wavFile).st_size != 0 and file + '.data' in files:
                    segments = Segment.SegmentList()
                    segments.parseJSON(wavFile + '.data')
                    self.manSegNum += len(segments.getSpecies(species))
                    # Currently, we ignore call types here and just
                    # look for all calls for the target species.
                    segments.exportGT(wavFile, species, resolution=self.window)

        if self.manSegNum == 0:
            print("ERROR: no segments for species %s found" % species)
            self.text = 0
            return

        # 1. Run Batch Processing upto WF and generate .tempdata files (no post-proc)
        # The object is not kept: constructing it is what runs the batch.
        # NOTE: will use wind-robust detection
        AviaNZ_batch.AviaNZ_batchProcess(
            parent=None, configdir=self.configdir, mode="test",
            sdir=self.testDir, recogniser=filtname, wind=1)

        # 2. Report statistics of WF followed by general post-proc steps
        #    (no CNN but wind-merge neighbours-delete short)
        self.text = self.getSummary(CNN=False)

        # 3. Report statistics of WF followed by post-proc steps
        #    (wind-CNN-merge neighbours-delete short)
        if "CNN" in self.currfilt:
            cl = SupportClasses.ConfigLoader()
            filterlist = cl.filters(self.filterdir, bats=False)
            CNNDicts = cl.CNNmodels(filterlist, self.filterdir, [filtname])
            # Providing one filter, so only one CNN should be returned:
            if len(CNNDicts) != 1:
                print("ERROR: Couldn't find a unique matching CNN!")
                self.outfile.write("No matching CNN found!\n")
                self.outfile.write("-- End of testing --\n")
                return
            self.text = self.getSummary(CNN=True)

        self.outfile.write("-- End of testing --\n")
    finally:
        # close() is idempotent, so this is safe on every path above.
        self.outfile.close()

    print("Testing output written to " + os.path.join(self.testDir, "test-results.txt"))
def splitData(self, infile, outdir, cutlen):
    """
    Split one .wav.data annotation file into consecutive pieces of cutlen seconds.

    The original length is read from the "Duration" metadata entry. When the
    file name ends in <date>_<time> (YYYYMMDD_HHMMSS or YYMMDD_HHMMSS), each
    output is named with the timestamp advanced by cutlen per piece;
    otherwise outputs are numbered <name>_0, <name>_1, ...

    Fixes relative to the previous version:
      * the per-file Duration is computed from the `cutlen` argument rather
        than from self.cutLen, so the method honours its own parameter;
      * a segment crossing a file boundary is now trimmed to end at `cutlen`
        in file-relative time. The old code stored the absolute boundary
        time, which was only right for the first output file.

    :param infile: path of the input .wav.data file
    :param outdir: folder receiving the split .wav.data files
    :param cutlen: duration of each output piece, in seconds
    :returns: None; problems are reported with print() and abort the split
    """
    print("Splitting data file", infile)
    segs = Segment.SegmentList()
    try:
        segs.parseJSON(infile)
    except Exception as e:
        print(e)
        print("ERROR: could not parse file", infile)
        return

    # strip the trailing ".wav.data"
    infile = os.path.basename(infile)[:-9]
    try:
        parts = infile.split("_")
        outprefix = '_'.join(parts[:-2])
        datestamp = '_'.join(parts[-2:])  # "date_time"
        try:
            stamp = dt.datetime.strptime(datestamp, "%Y%m%d_%H%M%S")
        except ValueError:
            stamp = dt.datetime.strptime(datestamp, "%y%m%d_%H%M%S")
        print(infile, "identified as timestamp", stamp)
    except ValueError:
        outprefix = infile
        print("Could not identify timestamp in", infile)
        stamp = None

    maxtime = segs.metadata["Duration"]
    if maxtime <= 0:
        print("ERROR: bad audio duration %s read from .data" % maxtime)
        return
    elif maxtime > 24 * 3600:
        print("ERROR: audio duration %s in .data exceeds 24 hr limit" % maxtime)
        return

    # repeat initial meta-segment for each output file
    # (output is determined by ceiling division)
    pieces = []
    for i in range(int(int(maxtime - 1) // cutlen) + 1):
        onelist = Segment.SegmentList()
        onelist.metadata = segs.metadata.copy()
        onelist.metadata["Duration"] = min(cutlen, maxtime - i * cutlen)
        pieces.append(onelist)

    # separate segments into output files and adjust segment timestamps
    # NOTE: segs grows during this loop on purpose - see below.
    for b in segs:
        filenum, adjst = divmod(b[0], cutlen)
        adjend = b[1] - filenum * cutlen
        # a segment can jut out past the end of a split file, so we trim it:
        # [a------|---b] -> [a-----f1end] [f2start----b]
        # If it's super long, it'll go back to the list to be trimmed again.
        if adjend > cutlen:
            print("trimming segment")
            # keep the rest for later, starting at the absolute time where
            # the current output file ends
            boundary = (filenum + 1) * cutlen
            segs.append([boundary, b[1], b[2], b[3], b[4]])
            # cut at the end of the starting file (file-relative time)
            adjend = cutlen
        pieces[int(filenum)].addSegment([adjst, adjend, b[2], b[3], b[4]])

    # save files, while increasing the filename datestamps
    for a, piece in enumerate(pieces):
        if stamp is not None:
            f2 = str(outprefix) + '_' + dt.datetime.strftime(
                stamp, "%Y%m%d_%H%M%S") + '.wav.data'
            stamp = stamp + dt.timedelta(seconds=cutlen)
        else:
            f2 = str(outprefix) + '_' + str(a) + '.wav.data'
        f2 = os.path.join(outdir, f2)
        print("outputting to", f2)
        piece.saveJSON(f2)
def cluster_by_dist(dir, feature='we', n_mels=24, fs=0, minlen=0.2, f_1=0, f_2=0, denoise=False, single=False, distance='dtw', max_clusters=10):
    """
    Incrementally cluster annotated segments by feature distance.

    Given wav + annotation files,
        1) identify syllables using median clipping/ FIR
        2) generate features WE/MFCC/chroma
        3) calculate DTW (or cross-correlation) distances and decide class/ generate new class

    Segments of each file are visited longest first. The first segment
    founds the first class; every later segment is compared against all
    existing classes and is either merged into one of them (Cases 2-4
    below) or founds a new class (Case 5).

    :param dir: directory of audio and annotations
    :param feature: 'we' or 'mfcc' or 'chroma' (compared in lower case)
    :param n_mels: number of mel coefs for MFCC
    :param fs: prefered sampling frequency, 0 leads to calculate it from the anotations
    :param minlen: min syllable length in secs
    :param f_1: lower frequency bound, 0 leads to calculate it from the anotations
    :param f_2: upper frequency bound, 0 leads to calculate it from the anotations
    :param denoise: wavelet denoise (forwarded to loadFile)
    :param single: True means when there are multiple syllables in a segment, add only one syllable to
                   the cluster info
    :param distance: 'dtw' or 'xcor'
    :param max_clusters: once this many classes exist, unmatched segments are
                   forced into an existing class instead of founding a new one
    :return: tuple (clustered_segs, fs, n_clusters); each entry of
             clustered_segs is [filename, seg, label], or
             [filename, seg, [features], label] when single is True
    """
    import Segment
    import SignalProc
    from scipy import signal

    # Get flow and fhigh for bandpass from annotations
    lowlist = []
    highlist = []
    srlist = []
    for root, dirs, files in os.walk(str(dir)):
        for file in files:
            if file.endswith('.wav') and file + '.data' in files:
                wavobj = wavio.read(os.path.join(root, file))
                srlist.append(wavobj.rate)
                # Read the annotation
                segments = Segment.SegmentList()
                segments.parseJSON(os.path.join(root, file + '.data'))
                for seg in segments:
                    lowlist.append(seg[2])
                    highlist.append(seg[3])
    print(lowlist)
    print(highlist)
    print(srlist)
    # NOTE(review): np.min/np.median raise on empty lists, i.e. when no
    # annotated .wav was found - confirm callers guarantee at least one.
    if f_1 == 0:
        f_1 = np.min(lowlist)
    if f_2 == 0:
        f_2 = np.median(highlist)

    if fs == 0:
        # pick the candidate rate closest to twice the median upper frequency
        arr = [4000, 8000, 16000]
        pos = np.abs(arr - np.median(highlist) * 2).argmin()
        fs = arr[pos]

    print('fs: ', fs)

    # never exceed the lowest sampling rate present in the data
    if fs > np.min(srlist):
        print(fs)
        fs = np.min(srlist)

    # keep the upper band edge 50 Hz below the Nyquist frequency
    if fs < f_2 * 2 + 50:
        f_2 = fs // 2 - 50

    minlen_samples = minlen * fs

    print('Frequency band:', f_1, '-', f_2)
    print('fs: ', fs)

    # Find the lower and upper bounds (relevant to the frq range), when the range is given
    # (ind_flow / ind_fhigh stay undefined for other features or a zero bound)
    if feature == 'mfcc' and f_1 != 0 and f_2 != 0:
        mels = librosa.core.mel_frequencies(n_mels=n_mels, fmin=0.0, fmax=fs / 2, htk=False)
        ind_flow = (np.abs(mels - f_1)).argmin()
        ind_fhigh = (np.abs(mels - f_2)).argmin()
    elif feature == 'we' and f_1 != 0 and f_2 != 0:
        linear = np.linspace(0, fs / 2, 62)
        ind_flow = (np.abs(linear - f_1)).argmin()
        ind_fhigh = (np.abs(linear - f_2)).argmin()

    # Ready for clustering
    max_clusters = max_clusters  # no-op self-assignment
    n_clusters = 0
    clusters = []
    for root, dirs, files in os.walk(str(dir)):
        for file in files:
            if file.endswith('.wav') and file + '.data' in files:
                # Read the annotation
                segments = Segment.SegmentList()
                segments.parseJSON(os.path.join(root, file + '.data'))

                # Sort the segments longest to shortest, would be a good idea to avoid making first class with only
                # one member :)
                # A leading entry whose start is -1 is dropped first
                # (presumably a legacy metadata segment - TODO confirm).
                if len(segments) > 0 and segments[0][0] == -1:
                    del segments[0]
                segments_len = [seg[1] - seg[0] for seg in segments]
                inds = np.argsort(segments_len)[::-1]
                sortedsegments = [segments[i] for i in inds]

                # Now find syllables within each segment, median clipping
                for seg in sortedsegments:
                    if seg[0] == -1:
                        continue
                    # loadFile here is a module-level helper defined outside this block
                    audiodata, sr = loadFile(filename=os.path.join(root, file), duration=seg[1] - seg[0], offset=seg[0], fs=fs, denoise=denoise, f1=f_1, f2=f_2)
                    start = int(seg[0] * fs)
                    sp = SignalProc.SignalProc(audiodata, fs, 256, 128)
                    sgRaw = sp.spectrogram(audiodata, 256, 128)
                    segment = Segment.Segmenter(data=audiodata, sg=sgRaw, sp=sp, fs=fs, window_width=256, incr=128)
                    syls = segment.medianClip(thr=3, medfiltersize=5, minaxislength=9, minSegment=50)
                    if len(syls) == 0:  # Try again with FIR
                        syls = segment.segmentByFIR(threshold=0.05)
                    syls = segment.checkSegmentOverlap(syls)  # merge overlapped segments
                    # seconds -> sample indices
                    # NOTE(review): the start is scaled by sr but the end by fs;
                    # these agree only if loadFile returns sr == fs - confirm.
                    syls = [[int(s[0] * sr), int(s[1] * fs)] for s in syls]

                    # Sanity check: when syllables are annotated tightly, median
                    # clipping doesn't detect them, so use the whole segment.
                    if len(syls) == 0:
                        syls = [[0, int((seg[1] - seg[0]) * fs)]]
                    if len(syls) > 1:
                        # TODO: samples to seconds
                        syls = segment.joinGaps(syls, minlen_samples)  # Merge short segments
                    if len(syls) == 1 and syls[0][1] - syls[0][0] < minlen_samples:  # Sanity check
                        syls = [[0, int((seg[1] - seg[0]) * fs)]]
                    # syllable bounds in seconds relative to the file, for display only
                    temp = [[np.round((x[0] + start) / fs, 2), np.round((x[1] + start) / fs, 2)] for x in syls]
                    print('\nCurrent:', seg, '--> syllables >', minlen, 'secs ', temp)

                    # Calculate features of the syllables in the current segment.
                    # f receives one feature array per syllable.
                    f = []
                    for s in syls:
                        data = audiodata[s[0]:s[1]]
                        if feature == 'mfcc':  # MFCC
                            mfcc = librosa.feature.mfcc(y=data, sr=fs, n_mfcc=n_mels)
                            if f_1 != 0 and f_2 != 0:
                                mfcc = mfcc[ind_flow:ind_fhigh, :]  # Limit the frequency to the fixed range [f_1, f_2]
                            mfcc_delta = librosa.feature.delta(mfcc, mode='nearest')
                            mfcc = np.concatenate((mfcc, mfcc_delta), axis=0)
                            mfcc = scale(mfcc, axis=1)
                            # librosa.display.specshow(mfcc, sr=fs, x_axis='time')
                            # m = [i for sublist in mfcc for i in sublist]
                            f.append(mfcc)
                        elif feature == 'we':  # Wavelet Energy
                            ws = WaveletSegment.WaveletSegment(spInfo=[])
                            we = ws.computeWaveletEnergy(data=data, sampleRate=fs, nlevels=5, wpmode='new')
                            we = we.mean(axis=1)
                            if f_1 != 0 and f_2 != 0:
                                we = we[ind_flow:ind_fhigh]  # Limit the frequency to a fixed range f_1, f_2
                            f.append(we)
                        elif feature == 'chroma':
                            chroma = librosa.feature.chroma_cqt(y=data, sr=fs)
                            # chroma = librosa.feature.chroma_stft(y=data, sr=fs)
                            chroma = scale(chroma, axis=1)
                            f.append(chroma)

                    matched = False
                    if n_clusters == 0:
                        print('**Case 1: First class')
                        newclass = class_create(label=n_clusters, syl=syls, features=f, f_low=seg[2], f_high=seg[3], segs=[(os.path.join(root, file), seg)], single=single, dist_method=distance)
                        clusters.append(newclass)
                        n_clusters += 1
                        print('Created new class: Class ', "'", newclass["label"], "'", ',\tIn-class_d: ', newclass["d"], '\tf_low: ', newclass["f_low"], '\tf_high: ', newclass["f_high"])
                        matched = True
                    if not matched:
                        # See if the syllables in the current seg match with any existing class
                        min_ds = []  # Keep track of the minimum distances to each class
                        clusters = random.sample(clusters, len(clusters))  # Shuffle the clusters to avoid bias
                        for c in range(len(clusters)):
                            f_c = clusters[c]["features"]  # features of the current class c
                            dist_c = np.zeros((len(f_c), len(f)))  # distances to the current class c
                            for i in range(len(f_c)):
                                for j in range(len(f)):
                                    if distance == 'dtw':
                                        d, _ = librosa.sequence.dtw(f_c[i], f[j], metric='euclidean')
                                        # bottom-right cell of the accumulated cost matrix
                                        dist_c[i, j] = d[d.shape[0] - 1][d.shape[1] - 1]
                                    elif distance == 'xcor':
                                        corr = signal.correlate(f_c[i], f[j], mode='full')
                                        dist_c[i, j] = np.sum(corr) / max(len(f_c[i]), len(f[j]))
                            # Min distance to the current class
                            # NOTE(review): exact zeros are excluded; if every entry
                            # is 0, np.amin is called on an empty array and raises.
                            print('Distance to Class ', clusters[c]["label"], ': ', np.amin(dist_c[dist_c != 0]), '( In-class distance: ', clusters[c]["d"], ')')
                            min_ds.append(np.amin(dist_c[dist_c != 0]))

                        # Now get the clusters sorted according to the min dist
                        ind = np.argsort(min_ds)
                        min_ds = np.sort(min_ds)
                        # make the cluster order
                        clusters = [clusters[i] for i in ind]
                        for c in range(len(clusters)):
                            # accept when within 110% of the class' in-class distance
                            if (clusters[c]["d"] != 0) and min_ds[c] < (clusters[c]["d"] + clusters[c]["d"] * 0.1):
                                print('**Case 2: Found a match with a class > one syllable')
                                print('Class ', clusters[c]["label"], ', dist ', min_ds[c])
                                # Update this class
                                clusters[c] = class_update(cluster=clusters[c], newfeatures=f, newf_low=seg[2], newf_high=seg[3], newsyl=syls, newseg=(os.path.join(root, file), seg), single=single, dist_method=distance)
                                matched = True
                                break  # found a match, exit from the for loop, go to the next segment
                            elif c < len(clusters) - 1:
                                continue  # continue to the next class

                    # Checked most of the classes by now, if still no match found, check the classes with only one
                    # data point (clusters[c]["d"] == 0).
                    # Note the arbitrary thr.
                    if not matched:
                        # NOTE(review): thr stays undefined for any other value of distance
                        if distance == 'dtw':
                            thr = 25
                        elif distance == 'xcor':
                            thr = 1000
                        for c in range(len(clusters)):
                            if clusters[c]["d"] == 0 and min_ds[c] < thr:
                                print('**Case 3: In-class dist of ', clusters[c]["label"], '=', clusters[c]["d"], 'and this example < ', thr, ' dist')
                                print('Class ', clusters[c]["label"], ', dist ', min_ds[c])
                                # Update this class
                                clusters[c] = class_update(cluster=clusters[c], newfeatures=f, newf_low=seg[2], newf_high=seg[3], newsyl=syls, newseg=(os.path.join(root, file), seg), single=single, dist_method=distance)
                                matched = True
                                break  # Break the search and go to the next segment

                    # If no match found yet, check the max clusters
                    if not matched:
                        if n_clusters == max_clusters:
                            print('**Case 4: Reached max classes, therefore adding current seg to the closest ' 'class... ')
                            # min_ind = np.argmin(min_ds)
                            # classes are sorted in ascending order of distance already
                            for c in range(len(clusters)):
                                if min_ds[c] <= 4 * clusters[c]["d"] or clusters[c]["d"] == 0:
                                    print('Class ', clusters[c]["label"], ', dist ', min_ds[c], '(in-class distance:', clusters[c]["d"], ')')
                                    # Update this class
                                    clusters[c] = class_update(cluster=clusters[c], newfeatures=f, newf_low=seg[2], newf_high=seg[3], newsyl=syls, newseg=(os.path.join(root, file), seg), single=single, dist_method=distance)
                                    matched = True
                                    break
                            if not matched:
                                # nothing was close enough: fall back to the nearest class
                                print('Class ', clusters[0]["label"], ', dist ', min_ds[0], '(in-class distance:', clusters[0]["d"], ')')
                                # Update this class
                                # TODO: don't update the class as it is an outlier?
                                clusters[0] = class_update(cluster=clusters[0], newfeatures=f, newf_low=seg[2], newf_high=seg[3], newsyl=syls, newseg=(os.path.join(root, file), seg), single=single, dist_method=distance)
                                matched = True
                            continue  # Continue to next segment

                    # If still no luck, create a new class
                    if not matched:
                        print('**Case 5: None of Case 1-4')
                        newclass = class_create(label=n_clusters, syl=syls, features=f, f_low=seg[2], f_high=seg[3], segs=[(os.path.join(root, file), seg)], single=single, dist_method=distance)
                        print('Created a new class: Class ', n_clusters + 1)
                        clusters.append(newclass)
                        n_clusters += 1
                        print('Created new class: Class ', "'", newclass["label"], "'", ',\tin-class_d: ', newclass["d"], '\tf_low: ', newclass["f_low"], '\tf_high: ', newclass["f_high"])

    print('\n\n--------------Clusters created-------------------')
    # Flatten the per-class segment lists into one list of labelled segments.
    clustered_segs = []
    for c in range(len(clusters)):
        print('Class ', clusters[c]['label'], ': ', len(clusters[c]['segs']))
        for s in range(len(clusters[c]['segs'])):
            print('\t', clusters[c]['segs'][s])
            if single:
                # assumes features[s] belongs to segs[s] when single is True - TODO confirm in class_update
                clustered_segs.append([clusters[c]['segs'][s][0], clusters[c]['segs'][s][1], [clusters[c]['features'][s]], clusters[c]['label']])
            else:
                clustered_segs.append([clusters[c]['segs'][s][0], clusters[c]['segs'][s][1], clusters[c]['label']])

    # Clustered segments
    print('\n\n################### Clustered segments ############################')
    for s in clustered_segs:
        print(s)
    return clustered_segs, fs, n_clusters
# audiodata = sp.ButterworthBandpass(audiodata, sampleRate, f1, f2) audiodata = sp.bandpassFilter(audiodata, sampleRate, f1, f2) return audiodata dir = "E:\ClusterData\BrownKiwi\Train" species = None fs = 16000 dataset = [] if os.path.isdir(dir): for root, dirs, files in os.walk(str(dir)): for file in files: if file.endswith('.wav') and file + '.data' in files: # Read the annotation segments = Segment.SegmentList() segments.parseJSON(os.path.join(root, file + '.data')) if species: thisSpSegs = segments.getSpecies(species) else: thisSpSegs = np.arange(len(segments)).tolist() # Now find syllables within each segment, median clipping for segix in thisSpSegs: seg = segments[segix] # Find the GT label for the syllables from this segment # 0 - brown kiwi, male # 1 - brown kiwi, female # 2 - LSK, male # 3 - LSK, female # 4 - morepork, more-pork
def loadFile(self, species, anysound=False):
    """
    Load self.filename and prepare self.segments for a new detection run.

    In "Wavelets" mode the wav is read and self.audiodata, self.sampleRate
    and self.datalength are set from it; in any other mode the file is read
    with readBmp and no audio data is stored at this point.

    Existing annotations are discarded entirely when species is
    ["Any sound"], when no .data file exists, or in "Click" mode; otherwise
    the .data file is parsed and only labels of the species belonging to
    the requested filters are wiped.

    Unless the method is "Click", the signal is impulse-masked, stored in
    self.audiodata, and the SignalProc instance is deleted.

    :param species: list of filter names (or ["Any sound"])
    :param anysound: use the engp=70, fp=0.50 impulse-mask settings
    """
    print(self.filename)

    # Create an instance of the Signal Processing class
    if not hasattr(self, 'sp'):
        self.sp = SignalProc.SignalProc(self.config['window_width'],
                                        self.config['incr'])

    # Read audiodata or spectrogram
    if self.method == "Wavelets":
        self.sp.readWav(self.filename)
        self.sampleRate = self.sp.sampleRate
        self.audiodata = self.sp.data
        self.datalength = np.shape(self.audiodata)[0]
        print("Read %d samples, %f s at %d Hz" %
              (len(self.audiodata),
               float(self.datalength) / self.sampleRate,
               self.sampleRate))
    else:
        self.sp.readBmp(self.filename, rotate=False)
        self.sampleRate = self.sp.sampleRate
        self.datalength = self.sp.fileLength

    # Read in stored segments (useful when doing multi-species)
    annotFile = self.filename + '.data'
    self.segments = Segment.SegmentList()
    startFresh = (species == ["Any sound"]
                  or not os.path.isfile(annotFile)
                  or self.method == "Click")
    duration = float(self.datalength) / self.sampleRate

    if startFresh:
        # Initialize default metadata values
        self.segments.metadata = {
            "Operator": "Auto",
            "Reviewer": "",
            "Duration": duration,
        }
        # wipe all segments:
        print("Wiping all previous segments")
        self.segments.clear()
    else:
        self.segments.parseJSON(annotFile, duration)
        # wipe same species:
        for filtName in species:
            # shorthand for double-checking that it's not "Any Sound" etc
            if filtName not in self.FilterDicts:
                continue
            spname = self.FilterDicts[filtName]["species"]
            print("Wiping species", spname)
            # go backwards so deletions do not shift pending indices
            for ix in reversed(self.segments.getSpecies(spname)):
                if self.segments[ix].wipeSpecies(spname):
                    del self.segments[ix]

    print("%d segments loaded from .data file" % len(self.segments))

    if self.method != "Click":
        # Do impulse masking by default
        if anysound:
            masked = self.sp.impMask(engp=70, fp=0.50)
        else:
            masked = self.sp.impMask()
        self.sp.data = masked
        self.audiodata = self.sp.data
        del self.sp
    gc.collect()
def mainloop(self, allwavs, total, speciesStr, filters, settings):
    """
    Process every file in allwavs: skip checks, detection, save, logging.

    A file is skipped when it was already processed (self.filesDone), is
    smaller than 1000 bytes, lacks the expected magic bytes (BM for "Click",
    RIFF otherwise), or is a DOC-style recording (name contains
    dddddd_dddddd) starting outside the configured time window.

    Fixes relative to the previous version:
      * the timestamp regex is a raw string (no invalid "\\d" escape);
      * a malformed file is only written to the log outside test mode,
        like every other skip branch;
      * the intermittent-sampling error path only touches self.ui outside
        test mode, like the detection error path;
      * after the user cancels, processing stops (returns 2) instead of
        continuing with a closed log file.

    :param allwavs: iterable of file paths to process
    :param total: total number of files, used for progress reporting
    :param speciesStr: species description; "Any sound" selects the
        any-sound settings in loadFile
    :param filters: filters passed on to detectFile
    :param settings: sequence whose items 1 and 2 are the start and end of
        the time window, in seconds since midnight
    :returns: 1 if processing a file raised, 2 if the user cancelled,
        None when the loop finishes
    """
    # MAIN PROCESSING starts here
    processingTime = 0
    cnt = 0
    timeWindow_s = settings[1]
    timeWindow_e = settings[2]
    for filename in allwavs:
        # get remaining run time in min
        processingTimeStart = time.time()
        hh, mm = divmod(processingTime * (total - cnt) / 60, 60)
        cnt = cnt + 1
        progrtext = "file %d / %d. Time remaining: %d h %.2f min" % (
            cnt, total, hh, mm)

        print("*** Processing" + progrtext + " ***")
        if not self.CLI and not self.testmode:
            self.ui.statusBar().showMessage("Processing " + progrtext)
            self.ui.update()

        # if it was processed previously (stored in log)
        if filename in self.filesDone:
            # skip the processing:
            print("File %s processed previously, skipping" % filename)
            continue

        # check if file not empty
        if os.stat(filename).st_size < 1000:
            print("File %s empty, skipping" % filename)
            if not self.testmode:
                self.log.appendFile(filename)
            continue

        # check if file is formatted correctly
        with open(filename, 'br') as f:
            if (self.method == "Click" and f.read(2) != b'BM') or (
                    self.method != "Click" and f.read(4) != b'RIFF'):
                print("Warning: file %s not formatted correctly, skipping" %
                      filename)
                if not self.testmode:
                    self.log.appendFile(filename)
                continue

        # test the selected time window if it is a doc recording
        DOCRecording = re.search(r'(\d{6})_(\d{6})', os.path.basename(filename))
        if DOCRecording:
            startTime = DOCRecording.group(2)
            sTime = int(startTime[:2]) * 3600 + int(
                startTime[2:4]) * 60 + int(startTime[4:6])
            if timeWindow_s == timeWindow_e:
                # (no time window set)
                inWindow = True
            elif timeWindow_s < timeWindow_e:
                # for day times ("8 to 17")
                inWindow = (sTime >= timeWindow_s and sTime <= timeWindow_e)
            else:
                # for times that include midnight ("17 to 8")
                inWindow = (sTime >= timeWindow_s or sTime <= timeWindow_e)
        else:
            inWindow = True

        if DOCRecording and not inWindow:
            print("Skipping out-of-time-window recording")
            if not self.testmode:
                self.log.appendFile(filename)
            continue

        # ALL SYSTEMS GO: process this file
        self.filename = filename
        self.segments = Segment.SegmentList()
        if self.method == "Intermittent sampling":
            try:
                self.addRegularSegments()
            except Exception:
                msg = "Encountered error:\n" + traceback.format_exc()
                print("ERROR: ", msg)
                if not self.CLI and not self.testmode:
                    self.ui.error_fileproc(total, msg)
                self.log.file.close()
                return (1)
        else:
            # load audiodata/spectrogram and clean up old segments:
            print("Loading file...")
            self.loadFile(species=self.species,
                          anysound=(speciesStr == "Any sound"))

            # initialize empty segmenter
            if self.method == "Wavelets":
                self.ws = WaveletSegment.WaveletSegment(wavelet='dmey2')

            # Main work is done here:
            try:
                print("Segmenting...")
                self.detectFile(speciesStr, filters)
            except Exception:
                msg = "Encountered error:\n" + traceback.format_exc()
                print("ERROR: ", msg)
                if not self.CLI and not self.testmode:
                    self.ui.error_fileproc(total, msg)
                self.log.file.close()
                return (1)

            print('Segments in this file: ', self.segments)

        # export segments
        print("%d new segments marked" % len(self.segments))
        cleanexit = self.saveAnnotation()
        if cleanexit != 1:
            print("Warning: could not save segments!")

        # Log success for this file and update ProgrDlg
        if not self.testmode:
            self.log.appendFile(filename)
        if not self.CLI and not self.testmode:
            response = self.ui.update_progress(cnt, total + 1, progrtext)
            if response == 2:
                print("Analysis cancelled")
                self.log.file.close()
                # The log is closed now, so no further file can be recorded.
                return (2)

        # track how long it took to process one file:
        processingTime = time.time() - processingTimeStart
        print("File processed in", processingTime)
def __init__(self,testDir,currfilt,configdir,filterdir,CLI=False):
    """
    Test a recogniser on an annotated folder and write test-results.txt.

    Steps: export ground truth from the .data annotations, run batch
    processing in "test" mode, summarise detection without the CNN (stored
    in self.flag / self.text) and, if the recogniser has a "CNN" entry and a
    matching model is found, summarise again with the CNN. Leftover
    .tmpdata files are removed at the end of a complete run.

    When no annotated segment of the species exists, self.flag is False and
    self.text is 0. The results file is now closed on every exit path,
    including that early return and any exception raised while testing.

    :param testDir: folder with .wav files and .data annotations; the
        results file is written here
    :param currfilt: the recogniser dict, or in CLI mode its name (with or
        without a trailing ".txt") to be looked up in filterdir
    :param configdir: config directory handed to batch processing
    :param filterdir: directory searched for filters and CNN models
    :param CLI: True when run from the command line
    """
    self.testDir = testDir
    self.outfile = open(os.path.join(self.testDir, "test-results.txt"),"w")
    try:
        if CLI:
            cl = SupportClasses.ConfigLoader()
            self.FilterDict = cl.filters(filterdir, bats=False)
            if currfilt.lower().endswith('.txt'):
                self.currfilt = self.FilterDict[currfilt[:-4]]
            else:
                self.currfilt = self.FilterDict[currfilt]
        else:
            self.currfilt = currfilt

        self.configdir = configdir
        self.filterdir = filterdir
        self.species = self.currfilt['species']
        self.sampleRate = self.currfilt['SampleRate']
        self.calltypes = [fi['calltype'] for fi in self.currfilt['Filters']]

        self.outfile.write("Recogniser name: %s\n" %(self.currfilt))
        self.outfile.write("Species name: %s\n" % (self.species))
        self.outfile.write("Using data: %s\n" % (self.testDir))

        # 0. Generate GT files from annotations in test folder
        self.manSegNum = 0
        self.window = 1
        inc = None
        print('Generating GT...')
        for root, dirs, files in os.walk(self.testDir):
            for file in files:
                wavFile = os.path.join(root, file)
                if file.lower().endswith('.wav') and os.stat(wavFile).st_size != 0 and file + '.data' in files:
                    segments = Segment.SegmentList()
                    segments.parseJSON(wavFile + '.data')
                    self.manSegNum += len(segments.getSpecies(self.species))
                    # Currently, we ignore call types here and just
                    # look for all calls for the target species.
                    segments.exportGT(wavFile, self.species, window=self.window, inc=inc)

        if self.manSegNum == 0:
            print("ERROR: no segments for species %s found" % self.species)
            self.flag = False
            self.text = 0
            return

        # 1. Run Batch Processing upto WF and generate .tempdata files (no post-proc)
        avianz_batch = AviaNZ_batch.AviaNZ_batchProcess(parent=None, configdir=self.configdir, mode="test",
                                                        sdir=self.testDir, recogniser=self.species, wind=True)

        # 2. Report statistics of WF followed by general post-proc steps (no CNN but wind-merge neighbours-delete short)
        self.flag, self.text = self.getSummary(avianz_batch, CNN=False)

        # 3. Report statistics of WF followed by post-proc steps (wind-CNN-merge neighbours-delete short)
        if "CNN" in self.currfilt:
            cl = SupportClasses.ConfigLoader()
            filterlist = cl.filters(self.filterdir, bats=False)
            CNNDicts = cl.CNNmodels(filterlist, self.filterdir, [self.species])
            if self.species in CNNDicts.keys():
                CNNmodel = CNNDicts[self.species]
                # NOTE(review): the CNN summary is not stored on self, so
                # self.flag / self.text keep the non-CNN result - confirm
                # this is intended.
                flag, text = self.getSummary(avianz_batch, CNN=True, CNNmodel=CNNmodel)
            else:
                print("Couldn't find a matching CNN!")
                self.outfile.write("-- End of testing --\n")
                return

        self.outfile.write("-- End of testing --\n")
    finally:
        # close() is idempotent, so this is safe on every path above.
        self.outfile.close()

    # Tidy up
    for root, dirs, files in os.walk(self.testDir):
        for file in files:
            if file.endswith('.tmpdata'):
                os.remove(os.path.join(root, file))

    if CLI:
        print("Output written to " + os.path.join(self.testDir, "test-results.txt"))
def checkInputDir(self):
    """
    Validate self.dirName and load every usable .wav.data file in it.

    File names must have the form <recorder>_<date>_<time>.wav.data, where
    the date is YYYYMMDD, or six digits read as DDMMYY first and YYMMDD as
    a fallback. Each file that parses is appended to self.annots with its
    wavname, recname and datetime attributes set, and its recorder name is
    added to self.allrecs. Files with unrecognised names or unreadable
    content are reported and skipped.

    :returns: 1 if the directory is missing or cannot be listed, else 0
    """
    if not os.path.isdir(self.dirName):
        print("ERROR: directory %s doesn't exist" % self.dirName)
        return 1

    # list all datas that will be processed
    try:
        alldatas = [
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(str(self.dirName))
            for name in names
            if name.lower().endswith('.wav.data')
        ]
    except Exception as e:
        print("ERROR: could not load dir %s" % self.dirName)
        print(e)
        return 1

    # TODO Note: this is a bit fidgety as needs to be adapted to each survey
    recsInDirNames = False
    defaultToDMY = True

    # read in all datas to self.annots
    for path in alldatas:
        # must have correct naming format:
        stem = os.path.basename(path)[:-9]
        try:
            if recsInDirNames:
                recname = os.path.basename(os.path.normpath(os.path.dirname(path)))
                filedate, filetime = stem.split("_")  # [date, time]
            else:
                recname, filedate, filetime = stem.split("_")  # [rec, date, time]
            datestamp = filedate + '_' + filetime  # "date_time"

            # Candidate formats, in the order they are tried: one for a
            # 4-digit year, otherwise both 2-digit-year layouts.
            if len(filedate) == 8:
                formats = ("%Y%m%d_%H%M%S",)
            elif defaultToDMY:
                formats = ("%d%m%y_%H%M%S", "%y%m%d_%H%M%S")
            else:
                formats = ("%y%m%d_%H%M%S", "%d%m%y_%H%M%S")

            try:
                stamp = dt.datetime.strptime(datestamp, formats[0])
            except ValueError:
                if len(formats) == 1:
                    raise
                stamp = dt.datetime.strptime(datestamp, formats[1])

            print("Recorder ", recname, " timestamp ", stamp)

            # timestamp identified, so read this file:
            segs = Segment.SegmentList()
            try:
                segs.parseJSON(path, silent=True)
            except Exception as e:
                print("Warning: could not read file %s" % path)
                print(e)
                continue

            # store the wav filename (path without the trailing ".data")
            segs.wavname = path[:-5]
            segs.recname = recname
            segs.datetime = stamp
            self.annots.append(segs)

            # also keep track of the different recorders
            self.allrecs.add(recname)
        except ValueError:
            print("Could not identify timestamp in", path)
            continue

    print("Detected recorders:", self.allrecs)
    return 0