def analyse_corpus_files(corpus):
    """
    Reusable script function for analysing the audio files comprising a
    corpus.

    Every path returned by ``corpus.list_audio_files()`` is handed to a single
    ``EssentiaAnalyser`` instance. A file for which the analyser raises
    ``EssentiaError`` is logged and skipped, so one bad file does not stop the
    remaining files from being analysed.

    :param corpus: object providing a ``list_audio_files()`` method.
    """
    file_list = corpus.list_audio_files()
    analyser = EssentiaAnalyser()
    for audio_file in file_list:
        try:
            analyser.analyse_audio(audio_file)
        except EssentiaError as e:
            # Best effort: report the failure and carry on with the next file.
            log.error("Essentia threw an error (%s), skipping this one: '%s'" % (e, audio_file))
def gaia_transform(points):
    """
    Takes a dict of point names and filepaths. Creates a DataSet and performs
    the standard transformations.

    The files are merged with ``DataSet.mergeFiles``, then the 'fixlength' and
    'cleaner' transformations are applied, and finally a 'remove'
    transformation is attempted for each entry returned by
    ``get_unused_descriptors()``. A failing removal is logged and skipped; the
    remaining removals are still attempted.

    :param points: dict of point names and filepaths, passed unchanged to
        ``DataSet.mergeFiles``.
    :returns: the transformed DataSet.
    """
    ds = DataSet.mergeFiles(points)
    ds = transform(ds, 'fixlength')
    ds = transform(ds, 'cleaner')
    for desc in get_unused_descriptors():
        try:
            ds = transform(ds, 'remove', desc)
        except Exception as e:
            # Deliberately tolerant: a failed removal leaves ``ds`` as it was.
            log.error("Problem removing this descriptor: %s" % e)
    # Callers save or query the result, so the DataSet has to be handed back.
    return ds
def process_highlevel(corpus, filepath, chop):
    """
    Utility method used to test the hierarchical system: builds high level
    segments for one file of ``corpus`` and stores, for each segment, its
    audio, its analysis and a gaia dataset of its constituent units.

    The units of ``filepath`` (for the given ``chop``) are accumulated into
    mosaics. A mosaic is closed once its length exceeds 5 and it holds more
    than one unit. Units without a ``.yaml`` analysis file next to them are
    skipped. Whatever is left over after the last closed mosaic is merged with
    that last mosaic. Each resulting segment is exported as ``NNNNN.wav`` in
    the ``highlevel_<chop>`` segments directory, analysed, and accompanied by
    a ``NNNNN.db`` dataset built from the ``.yaml`` files of its units.

    :param corpus: corpus object that lists the units and creates the
        segments directory.
    :param filepath: audio file of the corpus to process.
    :param chop: name of the chop whose units are grouped.
    """
    unit_paths = corpus.list_audio_units(audio_filename=filepath, chop=chop)
    segments = []
    current = Mosaic()
    analyser = EssentiaAnalyser()

    for unit_path in unit_paths:
        if not os.path.isfile(switch_ext(unit_path, '.yaml')):
            log.error("Cannot find analysis, assume that this file is silent: '%s'" % unit_path)
            continue
        current.add_unit(MosaicUnit(unit_path))
        log.debug("Number of units in mosaic is: %d, length of mosaic is %d" % (len(current.units), current.length))
        # Requiring more than one unit prevents a Gaia DataSet of only 1 Point
        # being created: with a single point, all descriptors get removed
        # during the 'cleaner' analysis.
        if len(current.units) <= 1 or not current.length > 5:
            continue
        log.debug("Current length of mosaic is %f, adding to list" % current.length)
        segments.append(current)
        current = Mosaic()

    if not segments:
        log.warn("Retrieved only 1 high level mosaic unit of length: %f" % current.length)
        segments.append(current)

    if current != segments[-1]:
        # Fold the trailing, unfinished mosaic together with the last segment.
        log.debug("The final mosaic is of length %f only" % segments[-1].length)
        log.debug("The last mosaic has a length of: %f" % current.length)
        current.merge_mosaics(segments[-1])
        segments[-1] = current

    log.debug("Finished assembling units into segments of > 5s")
    log.debug("There are %d segments in total to be processed for %s" % (len(segments), os.path.basename(filepath)))

    highlevel_dir = corpus._make_segments_dir(filepath, 'highlevel_%s' % chop)
    for index, segment in enumerate(segments):
        wav_path = os.path.join(highlevel_dir, '%05d.wav' % index)
        segment.export(wav_path)
        log.debug("Analysing audio: %s" % wav_path)
        analyser.analyse_audio(wav_path)

        log.debug("Segment has %d units" % len(segment.units))
        # Each unit's analysis file path serves as both the point name and
        # the file to load.
        unit_dict = {}
        for unit in segment.units:
            yaml_path = switch_ext(unit.filepath, '.yaml')
            unit_dict[yaml_path] = yaml_path

        unit_ds = gaia_transform(unit_dict)
        unit_ds.save(os.path.join(highlevel_dir, '%05d.db' % index))
def set_bin(self, bin_name):
    """
    Switch the analyser binary at runtime, e.g. to a faster binary when only
    low level features are needed.

    The new binary is looked up inside ``settings.ESSENTIA_BIN_DIR``. If no
    file of that name exists there, an error is logged and the current
    ``self.ESSENTIA_BIN`` is kept.

    :param bin_name: file name of the binary, relative to
        ``settings.ESSENTIA_BIN_DIR``.
    """
    candidate = os.path.join(settings.ESSENTIA_BIN_DIR, bin_name)
    if not os.path.isfile(candidate):
        log.error("'%s' is not a file, keeping original: '%s'" % (candidate, self.ESSENTIA_BIN))
        return
    log.info("Using new essentia bin: '%s'" % candidate)
    self.ESSENTIA_BIN = candidate
""" files = [f for f in glob('*.wav')] for af in files: log.debug("Found a wav file: '%s'" % af) nf = strip_all(af) os.rename(af, nf) def strip_all(input_string): """ Remove problem characters from filenames. Minimises annoying errors later on. Better safe than sorry!! """ new_string = input_string.replace(' ', '').replace('_', '').replace('-','').replace('(', '').replace(')', '').replace(',', '').replace("'", "").replace('&', '') # This expects at least 1 '.' to be present - i.e. the lowest count is 1 return new_string.replace('.','',(new_string.count('.'))-1) if __name__ == '__main__': if not os.path.isdir(TEST_DATA_DIR): log.error("The test data directory read from settings does not exist!:%s" % TEST_DATA_DIR ) sys.exit() else: os.chdir(TEST_DATA_DIR) execute_flac_convert() execute_mp3_convert()
def highlevel_mosaic(target, tcorpus, scorpus, scope=5): """ This will be used to test the highlevel mosaicing process. The scope variable controls the number of results which are returned for each target unit which is sought. """ # Create a temporary file for the mosaic audio filepath = os.path.join(os.getcwd(), 'temp_mosaic.wav') if os.path.isfile(filepath): os.remove(filepath) mosaic = Mosaic(filepath) cost = RepeatUnitCost() context = Context() gridder = Gridder() units = tcorpus.list_audio_units(audio_filename=target, chop='highlevel') hdb = scorpus.get_gaia_unit_db(chop='highlevel_%s' % self.chop) distance = get_mood_distance(hdb) v = View(hdb, distance) results = {} for f in units: p = Point() p.load(switch_ext(f, '.yaml')) unit_name = switch_ext(os.path.basename(f), '') p.setName(unit_name) p_m = hdb.history().mapPoint(p) results.update({f:v.nnSearch(p_m).get(scope)}) log.debug("Ok, now we have a dict with each target segment, along with its corresponding nearest matches in source db") log.debug("Check to see that we have every second of target audio accounted for - I think not!") #return results #new_results = results.copy() ds = DataSet() for r in results: units = [] for u in results[r]: ds.load(switch_ext(u[0], '.db')) for n in ds.pointNames(): units.append(n) new_ds = gaia_transform(dict(zip(units, units))) results.update({r:new_ds}) #return results # Very important - target units must be in correct order index = 0 index_skip = 0 for r in sorted(results.keys()): tds = DataSet() tds.load(switch_ext(r, '.db')) #return tds, results sds = results[r] source_set = set(sds.layout().descriptorNames()) target_set = set(tds.layout().descriptorNames()) remove_from_source = source_set.difference(target_set) remove_from_target = target_set.difference(source_set) if len(remove_from_source) > 0: log.debug("Will try to remove %s from the source DataSet" % remove_from_source) try: sds = transform(results[r], 'remove', {'descriptorNames':list(remove_from_source)}) except 
Exception, e: log.error("Failed to remove %s from source DataSet" % list(remove_from_source)) return results[r], tds if len(remove_from_target) > 0: log.debug("Will try to remove %s from the target DataSet" % remove_from_source) try: tds = transform(tds, 'remove', {'descriptorNames':list(remove_from_target)}) except Exception, e: log.error("Failed to remove %s from target DataSet" % list(remove_from_target)) return results[r], tds
sv = View(sds, get_low_level_distance(sds)) log.debug("Beginning to loop through units for this segment") #return tds, sds, gridder, sv for pname in sorted(tds.pointNames()): """ I don't think I need to map the point in this instance as we have already transformed the DataSet. try, analyse_corpus: p_m = sds.history().mapPoint(p) except Exception, e: log.error("Error mapping %s to %s: %s" % (p, sds, e)) return tds, p, sds """ print pname if os.path.basename(pname) != '%07d.yaml' % (index + index_skip): log.error("Current unit is %s => Missing a unit '%07d.yaml'- it must be silent... Index is %d, index skip is %d" % (pname, (index + index_skip), index, index_skip)) u = MosaicUnit(os.path.join(os.path.dirname(pname), '%07d.wav' % (index + index_skip))) u.silent = True mosaic.add_unit(u) index_skip += 1 p = Point() p.load(pname) p_m = sds.history().mapPoint(p) unit_results = sv.nnSearch(p_m).get(scope) log.debug("For %s, the closest matching points are: %s" % (pname, unit_results)) log.debug("Applying repition cost") unit_results = cost.get_results(unit_results) log.debug("Results are now: %s" % str(unit_results)) log.debug("Applying Context cost") unit_results = context.get_results(unit_results)