def train_chunks(self, pts, chunk_data, ref_freq, metric='pcd'):
    """-------------------------------------------------------------------------
    Builds one pitch distribution per chunk of a single recording and returns
    the distributions as a list, in the same order as the chunks.
    ----------------------------------------------------------------------------
    pts        : Sequence of (cent-converted) pitch tracks, one per chunk.
    chunk_data : Sequence indexed in parallel with pts. For each entry,
                 element 0 is passed on as the distribution's source and
                 elements 1 and 2 are passed on together as its segment.
    ref_freq   : Reference frequency handed to the PD generation.
    metric     : 'pcd' additionally passes every distribution through
                 mf.generate_pcd(); any other value leaves it untouched.

    Returns the list of generated distributions.
    -------------------------------------------------------------------------"""
    wrap_octaves = (metric == 'pcd')
    distributions = []

    for chunk_no, chunk_track in enumerate(pts):
        # Bookkeeping that travels with the chunk's distribution.
        chunk_info = chunk_data[chunk_no]
        chunk_source = chunk_info[0]
        chunk_span = (chunk_info[1], chunk_info[2])

        chunk_dist = mf.generate_pd(
            chunk_track,
            ref_freq=ref_freq,
            smooth_factor=self.smooth_factor,
            step_size=self.step_size,
            source=chunk_source,
            segment=chunk_span,
            overlap=self.overlap)

        if wrap_octaves:
            chunk_dist = mf.generate_pcd(chunk_dist)

        distributions.append(chunk_dist)

    return distributions
def train(self, mode_name, pitch_files, tonic_freqs, metric='pcd', save_dir=''):
    """-------------------------------------------------------------------------
    Trains the model of a single mode. Every recording's pitch track is
    converted to cents with respect to its own annotated tonic, the converted
    tracks are concatenated into one long track and a single pitch
    distribution is computed from it. The two input lists are consumed in
    parallel: the i-th tonic belongs to the i-th pitch file.
    ----------------------------------------------------------------------------
    mode_name   : Name of the mode. Used as the source label when slicing and
                  for naming the saved file ("mode_name.json").
    pitch_files : List of files readable by np.loadtxt. If a file has more
                  than one column, column 1 is taken as the pitch.
    tonic_freqs : List of annotated tonic frequencies of the recordings.
    metric      : 'pcd' converts the result with mF.generate_pcd(); any other
                  value keeps the plain pitch distribution.
    save_dir    : If non-empty, the directory (created when missing) in which
                  the distribution is saved.

    Returns the resulting distribution object.
    Raises ValueError if no pitch track is given.
    -------------------------------------------------------------------------"""
    cent_tracks = []
    segs = None

    # Normalize each pitch track wrt its tonic; they are concatenated below.
    for pf, tonic in zip(pitch_files, tonic_freqs):
        pitch_track = np.loadtxt(pf)
        if pitch_track.ndim > 1:
            # assume the first col is time, the second is pitch and the rest
            # is labels etc.
            pitch_track = pitch_track[:, 1]

        if self.chunk_size == 0:
            # use the complete pitch track
            cur_track = mF.hz_to_cent(pitch_track, ref_freq=tonic)
        else:
            # slice and use only the first chunk of the pitch track
            time_track = np.arange(0, self.frame_rate * len(pitch_track),
                                   self.frame_rate)
            chunks, segs = mF.slice(time_track, pitch_track, mode_name,
                                    self.chunk_size)
            cur_track = mF.hz_to_cent(chunks[0], ref_freq=tonic)

        # Keep every recording. Previously the track was overwritten on each
        # iteration, so only the last recording contributed to the model.
        cent_tracks.append(np.asarray(cur_track))

    if not cent_tracks:
        raise ValueError("No pitch tracks given to train the mode model!")

    mode_track = np.concatenate(cent_tracks)

    # NOTE(review): with slicing enabled, the reported segment is the one of
    # the last recording's first chunk -- confirm this is the intended label.
    seglen = 'all' if self.chunk_size == 0 else (segs[0][1], segs[0][2])

    # generate the pitch distribution
    pitch_distrib = mF.generate_pd(mode_track,
                                   smooth_factor=self.smooth_factor,
                                   step_size=self.step_size,
                                   source=pitch_files,
                                   segment=seglen)
    if metric == 'pcd':
        # convert to pitch class distribution, if specified
        pitch_distrib = mF.generate_pcd(pitch_distrib)

    # save the model to a file, if requested
    if save_dir:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        pitch_distrib.save(mode_name + '.json', save_dir=save_dir)

    return pitch_distrib
def tonic_evaluate(self, mbid, estimated, annotated):
    """-------------------------------------------------------------------------
    Compares an estimated tonic against the annotated one.
    ----------------------------------------------------------------------------
    mbid      : Identifier of the recording; copied to the result untouched.
    estimated : Estimated tonic frequency.
    annotated : Annotated tonic frequency, used as the reference of the
                Hz-to-cent conversion.

    Returns a dictionary with the keys 'mbid', 'tonic_eval', 'same_octave',
    'cent_diff', 'interval', 'annotated_tonic' and 'estimated_tonic'.
    'interval' is None when the octave-wrapped difference falls in none of
    the ranges of self.INTERVAL_SYMBOLS.
    -------------------------------------------------------------------------"""
    est_cent = mf.hz_to_cent([estimated], annotated)[0]

    # octave wrapping
    cent_diff = est_cent % self.CENT_PER_OCTAVE

    # the tonic is correct if the wrapped distance, measured in either
    # direction around the octave, is below the tolerance
    bool_tonic = min([cent_diff, self.CENT_PER_OCTAVE - cent_diff]) < self.tolerance

    # convert the cent difference to symbolic interval (P5, m3 etc.);
    # each symbol is (name, lower bound inclusive, upper bound exclusive).
    # Default to None so that an unmatched difference no longer raises
    # UnboundLocalError when the result dictionary is built.
    interval = None
    for symbol in self.INTERVAL_SYMBOLS:
        if symbol[1] <= cent_diff < symbol[2]:
            interval = symbol[0]
            break
        elif cent_diff == self.CENT_PER_OCTAVE:
            # Float modulo may return the modulus itself for a tiny negative
            # est_cent; that is a full octave, i.e. a unison.
            interval = 'P1'
            break

    # if they are in the same octave, the estimated and octave-wrapped values
    # should be (nearly) the same.
    # NOTE(review): for any negative est_cent this difference is negative, so
    # the flag is True even when the estimate is octaves below -- confirm.
    same_octave = (est_cent - cent_diff < 0.001)

    return {'mbid': mbid,
            'tonic_eval': bool_tonic,
            'same_octave': same_octave,
            'cent_diff': cent_diff,
            'interval': interval,
            'annotated_tonic': annotated,
            'estimated_tonic': estimated}
def train(self, mode_name, pt_files, tonic_freqs, metric='pcd', save_dir=''):
    """-------------------------------------------------------------------------
    Trains the model of a single mode as a list of chunk distributions. Each
    pitch track is sliced into chunks (unless chunk_size is 0), every chunk is
    converted to cents with respect to the recording's annotated tonic and
    its distribution is generated by train_chunks(). The distributions of all
    recordings are collected in one flat list. The two input lists are
    consumed in parallel: the i-th tonic belongs to the i-th pitch file.
    ----------------------------------------------------------------------------
    mode_name   : Name of the mode; only used for naming the saved file
                  ("mode_name.json").
    pt_files    : List of files readable by np.loadtxt. If a file has more
                  than one column, column 1 is taken as the pitch.
    tonic_freqs : List of annotated tonics of the recordings. A single tonic
                  is used for all chunks of a recording.
    metric      : Forwarded to train_chunks() ('pcd' or 'pd').
    save_dir    : If non-empty, the directory (created when missing) in which
                  the list is dumped as JSON.

    Returns the flat list of chunk distributions.
    -------------------------------------------------------------------------"""
    save_name = mode_name + '.json'
    pitch_distrib_list = []

    for pf, tonic in zip(pt_files, tonic_freqs):
        pitch_track = np.loadtxt(pf)
        if pitch_track.ndim > 1:
            # assume the first col is time, the second is pitch and the rest
            # is labels etc.
            pitch_track = pitch_track[:, 1]
        time_track = np.arange(0, (self.frame_rate * len(pitch_track)),
                               self.frame_rate)

        # Current pitch track is sliced into chunks.
        if self.chunk_size == 0:
            # No slicing: the whole track is the only chunk. train_chunks()
            # reads each chunk_data entry as (source, start, end); the former
            # bare string pf + '_all' made it pick single characters instead.
            pts = [pitch_track]
            if len(time_track):
                start, end = float(time_track[0]), float(time_track[-1])
            else:
                start, end = 0.0, 0.0
            chunk_data = [(pf, start, end)]
        else:
            pts, chunk_data = mf.slice(time_track, pitch_track, pf,
                                       self.chunk_size, self.threshold,
                                       self.overlap)

        # Each chunk is converted to cents
        pts = [mf.hz_to_cent(k, ref_freq=tonic) for k in pts]

        # One distribution per chunk; the per-recording lists are flattened
        # so every chunk is an individual sample of the mode.
        pitch_distrib_list.extend(
            self.train_chunks(pts, chunk_data, tonic, metric))

    # save the model to a file, if requested
    if save_dir:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Dump the list of dictionaries in a JSON file.
        dist_json = [{'bins': d.bins.tolist(),
                      'vals': d.vals.tolist(),
                      'kernel_width': d.kernel_width,
                      'source': d.source,
                      'ref_freq': d.ref_freq,
                      'segmentation': d.segmentation,
                      'overlap': d.overlap} for d in pitch_distrib_list]

        with open(os.path.join(save_dir, save_name), 'w') as f:
            json.dump(dist_json, f, indent=2)

    return pitch_distrib_list
def chunk_estimate(self, pitch_track, mode_names=[], mode_name='', mode_dir='./', est_tonic=True, est_mode=True, distance_method="euclidean", metric='pcd', ref_freq=440, min_cnt=3, equalSamplePerMode = False):
    """-------------------------------------------------------------------------
    Generates the distribution of a single pitch track chunk, compares it
    with the stored chunk distributions of the candidate mode(s) and returns
    the min_cnt nearest neighbors, closest first.
    ----------------------------------------------------------------------------
    pitch_track        : Pitch track chunk (frequency values) to be analysed.
    mode_names         : Names of the candidate modes; their collections are
                         loaded with self.load_collection(). Never mutated.
    mode_name          : Annotated mode. When non-empty, its collection is
                         loaded and used by the tonic-only estimation.
    mode_dir           : Directory passed to self.load_collection().
    est_tonic          : Whether the tonic is to be estimated. If False,
                         ref_freq is treated as the annotated tonic.
    est_mode           : Whether the mode is to be estimated.
    distance_method    : Forwarded to the distance computations of mf.
    metric             : 'pcd' or 'pd'.
    ref_freq           : Reference frequency for the cent conversion.
    min_cnt            : Number of nearest neighbors to report.
    equalSamplePerMode : If True, every candidate collection is randomly
                         subsampled to the size of the smallest one.

    Returns
      joint estimation : [[mode_list, tonic_list], distances]
      tonic estimation : [tonic_list, distances], entries (tonic_hz, source)
      mode estimation  : [mode_list, distances], entries (mode_name, source)
      neither          : 0
    where "source" is the stored source of the neighbor without its last six
    characters (presumably a file extension -- confirm).
    -------------------------------------------------------------------------"""
    # Preliminaries: Hz-to-cent conversion and distribution of the chunk.
    cent_track = mf.hz_to_cent(pitch_track, ref_freq)
    dist = mf.generate_pd(cent_track, ref_freq=ref_freq,
                          smooth_factor=self.smooth_factor,
                          step_size=self.step_size)
    dist = mf.generate_pcd(dist) if (metric == 'pcd') else dist

    # Chunk distributions of every candidate mode.
    mode_collections = [self.load_collection(mode, dist_dir=mode_dir)
                        for mode in mode_names]
    if equalSamplePerMode:
        minSamp = min([len(n) for n in mode_collections])
        for i, m in enumerate(mode_collections):
            mode_collections[i] = random.sample(m, minSamp)

    # cum_lens (cumulative lengths) maps an index of the flattened
    # mode_dists list back to the mode the chunk came from.
    cum_lens = np.cumsum([len(col) for col in mode_collections])
    mode_dists = [d for col in mode_collections for d in col]

    # Collection of the annotated mode, if there is one.
    mode_dist = self.load_collection(mode_name, dist_dir=mode_dir) if (mode_name != '') else None

    # Initializations of possible output parameters
    tonic_list = [0 for x in range(min_cnt)]
    mode_list = ['' for x in range(min_cnt)]
    min_distance_list = np.zeros(min_cnt)

    # Steps shared by joint and tonic-only estimation.
    if est_tonic:
        if metric == 'pcd':
            # A peak on the wrap-around point of the circular PCD would be
            # seen as two peaks. To avoid that, the distribution is shifted
            # by the index of its global minimum.
            shift_factor = dist.vals.tolist().index(min(dist.vals))
            dist = dist.shift(shift_factor)

            # anti_freq is the reference frequency used to convert the peak
            # bins of the shifted distribution back to Hz.
            anti_freq = mf.cent_to_hz([dist.bins[shift_factor]],
                                      ref_freq=ref_freq)[0]

            # Peaks are the tonic candidates.
            peak_idxs, peak_vals = dist.detect_peaks()
        elif metric == 'pd':
            # PD is not circular, so no shifting is needed.
            peak_idxs, peak_vals = dist.detect_peaks()

            # Shifts are expressed relative to the bin whose value is zero.
            origin = np.where(dist.bins == 0)[0][0]
            shift_idxs = [(idx - origin) for idx in peak_idxs]

    # Joint estimation
    if est_tonic and est_mode:
        # Rows are tonic candidates, columns are candidate chunks.
        if metric == 'pcd':
            dist_mat = mf.generate_distance_matrix(dist, peak_idxs,
                                                   mode_dists,
                                                   method=distance_method)
        elif metric == 'pd':
            # mf.tonic_estimate() handles one candidate chunk at a time, so
            # the matrix is filled column by column.
            dist_mat = np.zeros((len(shift_idxs), len(mode_dists)))
            for m in range(len(mode_dists)):
                dist_mat[:, m] = mf.tonic_estimate(
                    dist, shift_idxs, mode_dists[m],
                    distance_method=distance_method, metric=metric,
                    step_size=self.step_size)

        # The current minimum is reported and then overwritten by a value
        # larger than the maximum, so the next iteration finds the next
        # nearest neighbor.
        for r in range(min_cnt):
            hits = np.where(dist_mat == np.amin(dist_mat))
            min_row, min_col = hits[0][0], hits[1][0]

            # The cent value of the winning candidate is converted to Hz.
            # PCD was shifted, hence its separate reference frequency.
            if metric == 'pcd':
                tonic_list[r] = mf.cent_to_hz(
                    [dist.bins[peak_idxs[min_row]]], anti_freq)[0]
            elif metric == 'pd':
                tonic_list[r] = mf.cent_to_hz(
                    [shift_idxs[min_row] * self.step_size], ref_freq)[0]

            # The mode is that of the first collection whose cumulative
            # length exceeds the column index.
            mode_list[r] = (mode_names[min(np.where((cum_lens > min_col))[0])],
                            mode_dists[min_col].source[:-6])

            # Reported for evaluation only.
            min_distance_list[r] = dist_mat[min_row][min_col]
            dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
        return [[mode_list, tonic_list], min_distance_list.tolist()]

    # Tonic estimation
    elif est_tonic:
        # PD and PCD differ only in the candidate indices handed to
        # mf.tonic_estimate() and in the reference of the Hz conversion.
        candidates = shift_idxs if metric == 'pd' else peak_idxs
        anchor_freq = ref_freq if metric == 'pd' else anti_freq

        # One row per chunk distribution of the annotated mode, one column
        # per tonic candidate.
        dist_mat = [mf.tonic_estimate(dist, candidates, d,
                                      distance_method=distance_method,
                                      metric=metric,
                                      step_size=self.step_size)
                    for d in mode_dist]

        for r in range(min_cnt):
            hits = np.where(dist_mat == np.amin(dist_mat))
            min_row, min_col = hits[0][0], hits[1][0]

            # Cent value of the winning candidate. For PD the candidates are
            # origin-relative shifts, not bin indices, so they must not be
            # used to index dist.bins; the value is computed as in the joint
            # estimation above.
            if metric == 'pd':
                cand_cent = shift_idxs[min_col] * self.step_size
            else:
                cand_cent = dist.bins[peak_idxs[min_col]]

            tonic_list[r] = (mf.cent_to_hz([cand_cent], anchor_freq)[0],
                             mode_dist[min_row].source[:-6])
            min_distance_list[r] = dist_mat[min_row][min_col]
            dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
        return [tonic_list, min_distance_list.tolist()]

    # Mode estimation
    elif est_mode:
        # The tonic is annotated, so the distribution is not shifted and the
        # distances form a vector with one entry per candidate chunk.
        distance_vector = mf.mode_estimate(dist, mode_dists,
                                           distance_method=distance_method,
                                           metric=metric,
                                           step_size=self.step_size)

        for r in range(min_cnt):
            idx = np.argmin(distance_vector)
            mode_list[r] = (mode_names[min(np.where((cum_lens > idx))[0])],
                            mode_dists[idx].source[:-6])
            min_distance_list[r] = distance_vector[idx]
            distance_vector[idx] = (np.amax(distance_vector) + 1)
        return [mode_list, min_distance_list.tolist()]

    else:
        return 0
def estimate(self, pitch_file, mode_names=[], mode_name='', mode_dir='./', est_mode=True, distance_method="euclidean", metric='pcd', tonic_freq=None, k_param=1, equalSamplePerMode = False):
    """-------------------------------------------------------------------------
    Estimates the tonic and/or the mode of a recording. The pitch track is
    sliced into chunks, chunk_estimate() returns the nearest neighbors of
    every chunk, and a k-nearest-neighbor vote over the union of these
    neighbors gives the estimation for the whole recording.

    Which estimation is made follows from the arguments:
      1) Joint estimation : tonic_freq is not given and est_mode is True.
      2) Tonic estimation : tonic_freq is not given and est_mode is False;
                            mode_name is the annotated mode.
      3) Mode estimation  : tonic_freq is given and est_mode is True.
    If tonic_freq is given and est_mode is False there is nothing to
    estimate: a message is printed and -1 is returned.
    ----------------------------------------------------------------------------
    pitch_file         : File readable by np.loadtxt. If it has more than one
                         column, column 1 is taken as the pitch.
    mode_names         : Names of the candidate modes. Never mutated.
    mode_name          : Annotated mode of the recording.
    mode_dir           : Directory where the mode models are stored.
    est_mode           : Whether the mode is to be estimated.
    distance_method    : Forwarded to chunk_estimate().
    metric             : 'pcd' or 'pd'; forwarded to chunk_estimate().
    tonic_freq         : Annotated tonic. If falsy, the tonic is estimated
                         and 440 is used as a dummy reference frequency.
    k_param            : The k parameter of K Nearest Neighbors.
    equalSamplePerMode : Forwarded to chunk_estimate().

    Returns [estimation, sources, distances], where sources and distances
    belong to the nearest neighbors that voted for the estimation. The
    estimation is a (tonic, mode) pair, a mode name or a tonic frequency.
    -------------------------------------------------------------------------"""
    def _vote(distances, labels, sources):
        # Takes the k_param candidates with the smallest distances and
        # returns the most frequent label among them, together with the
        # sources and distances of the neighbors carrying that label.
        kn_distances, kn_ests, kn_sources = [], [], []
        for _ in range(k_param):
            idx = np.argmin(distances)
            kn_distances.append(distances[idx])
            kn_ests.append(labels[idx])
            kn_sources.append(sources[idx])
            # Overwrite the minimum so the next pass finds the next nearest.
            distances[idx] = (np.amax(distances) + 1)

        elem_counts = list(set(kn_ests))
        idx_counts = [kn_ests.count(c) for c in elem_counts]
        winner = elem_counts[np.argmax(idx_counts)]

        res_sources, res_distances = [], []
        for m in range(len(kn_ests)):
            if kn_ests[m] == winner:
                res_sources.append(kn_sources[m])
                res_distances.append(kn_distances[m])
        return [winner, res_sources, res_distances]

    # load pitch track
    pitch_track = np.loadtxt(pitch_file)
    # assume the first col is time, the second is pitch and the rest is
    # labels etc.
    pitch_track = pitch_track[:, 1] if pitch_track.ndim > 1 else pitch_track

    # Pitch track is sliced into chunks.
    time_track = np.arange(0, (self.frame_rate * len(pitch_track)),
                           self.frame_rate)
    if self.chunk_size == 0:
        # no slicing
        pts = [pitch_track]
    else:
        pts, chunk_data = mf.slice(time_track, pitch_track, 'input',
                                   self.chunk_size, self.threshold,
                                   self.overlap)

    # Every chunk returns min_cnt neighbors and the final neighbors are
    # picked from their union. This is an overshoot; min_cnt >= k_param
    # would be enough.
    ### TODO: shrink this value as much as possible.
    min_cnt = len(pts) * k_param

    # parse tonic input
    if tonic_freq:
        # tonic is already known
        est_tonic = False
    else:
        est_tonic = True
        # take A4 as the dummy frequency value for cent conversion
        tonic_freq = 440

    if not (est_tonic or est_mode):
        print("Both tonic and mode are known!")
        return -1

    # Nearest neighbors of each chunk, in chunk order.
    neighbors = [self.chunk_estimate(chunk,
                                     mode_names=mode_names,
                                     mode_name=mode_name,
                                     mode_dir=mode_dir,
                                     est_tonic=est_tonic,
                                     est_mode=est_mode,
                                     distance_method=distance_method,
                                     metric=metric,
                                     ref_freq=tonic_freq,
                                     min_cnt=min_cnt,
                                     equalSamplePerMode=equalSamplePerMode)
                 for chunk in pts]

    # Flatten the per-chunk results: distances, the label each neighbor
    # votes for and the source of each neighbor.
    candidate_distances, candidate_ests, candidate_sources = [], [], []
    joint = est_mode and est_tonic
    for chunk_result in neighbors:
        labels = chunk_result[0]
        candidate_distances.extend(chunk_result[1])
        if joint:
            # labels is [mode_list, tonic_list]; mode entries are
            # (mode_name, source). The vote is on (tonic, mode) pairs.
            mode_hits, tonic_hits = labels[0], labels[1]
            for l in range(len(tonic_hits)):
                candidate_ests.append((tonic_hits[l], mode_hits[l][0]))
                candidate_sources.append(mode_hits[l][1])
        else:
            # Entries are (mode_name, source) or (tonic, source).
            for hit in labels:
                candidate_ests.append(hit[0])
                candidate_sources.append(hit[1])

    return _vote(candidate_distances, candidate_ests, candidate_sources)
def estimate(self, pitch_file, mode_in='./', tonic_freq=None, rank=1,
             distance_method="bhat", metric='pcd'):
    """-------------------------------------------------------------------------
    This is the ultimate estimation function. What gets estimated is decided
    from the inputs; there are three different types of estimations.

    1) Joint Estimation: Neither the tonic nor the mode of the recording is
    known. Selected when tonic_freq is not given and mode_in describes
    several candidate modes (list of files, directory or dictionary).

    2) Tonic Estimation: The mode of the recording is known and tonic is to
    be estimated. Selected when tonic_freq is not given and mode_in is a
    single json file or a single PitchDistribution object.

    3) Mode Estimation: The tonic of the recording is known and mode is to be
    estimated. Selected when tonic_freq is given and mode_in describes
    several candidate modes.
    ----------------------------------------------------------------------------
    pitch_file      : File holding the pitch track of the input recording. It
                      is read with np.loadtxt; if it has more than one column,
                      the second column is taken as the pitch values.
    mode_in         : The mode input. If it is a filename or a
                      PitchDistribution object, the mode is treated as known
                      and only tonic will be estimated. If a list of json
                      files, a directory with json files or a dictionary of
                      distributions (per mode) is given, the mode will be
                      estimated. In case of a directory, the mode names are
                      the json filenames without the extension. In case of a
                      list of files, the mode names are the given paths
                      without the extension.
    tonic_freq      : Annotated tonic of the recording. If it's unknown
                      (None), a dummy reference of 440 Hz is used for the
                      cent conversion and the tonic is estimated.
    rank            : The number of estimations expected from the system.
    distance_method : The choice of distance methods. See distance() in
                      ModeFunctions for more information.
    metric          : Whether the model should be octave wrapped (Pitch Class
                      Distribution: 'pcd') or not (Pitch Distribution: 'pD').
    ----------------------------------------------------------------------------
    Returns
      Joint estimation : (mode_ranked, tonic_ranked)
      Tonic estimation : tonic_ranked
      Mode estimation  : mode_ranked
    Each *_ranked value is a list of length rank holding (estimate, distance)
    tuples, best estimate first. Slots that could not be filled (fewer
    candidates than rank) keep the placeholder ('', 0).

    Raises
      ValueError : if mode_in cannot be interpreted, if both the tonic and
                   the mode are already known, or if tonic estimation is
                   requested with a metric other than 'pcd' or 'pD'.
    -------------------------------------------------------------------------"""
    # load pitch track
    pitch_track = np.loadtxt(pitch_file)

    # assume the first col is time, the second is pitch and the rest is
    # labels etc.
    if pitch_track.ndim > 1:
        pitch_track = pitch_track[:, 1]

    # ---- parse mode input ----
    # est_mode stays None until the input is recognised, so that an unknown
    # input is reported explicitly instead of failing later on a missing name.
    est_mode = None
    model = None
    models = None
    mode_names = None

    # str on Python 3; str and unicode on Python 2.
    string_types = (str, type(u''))

    if isinstance(mode_in, pD.PitchDistribution):
        # mode is loaded; mode already known
        est_mode = False
        model = mode_in
    elif isinstance(mode_in, string_types):
        if os.path.isfile(mode_in):
            # single json file; mode already known
            est_mode = False
            model = pD.load(mode_in)
        elif os.path.isdir(mode_in):
            # directory of json files; one candidate mode per file. Sorted
            # so that the candidate order does not depend on the file system.
            json_files = sorted(f for f in os.listdir(mode_in)
                                if f.endswith('.json'))
            if json_files:
                est_mode = True
                mode_names = [os.path.splitext(f)[0] for f in json_files]
                models = [pD.load(os.path.join(mode_in, f))
                          for f in json_files]
    elif hasattr(mode_in, 'keys') and hasattr(mode_in, 'values'):
        # dictionary of loaded models, keyed by mode name
        if mode_in and all(isinstance(m, pD.PitchDistribution)
                           for m in mode_in.values()):
            est_mode = True
            # list() so that the names can be indexed on Python 3 as well
            mode_names = list(mode_in.keys())
            models = [mode_in[m] for m in mode_names]
    else:
        # list of json files, one per mode. Only the validation is guarded;
        # errors raised while loading the files are not swallowed.
        try:
            mode_files = list(mode_in)
            are_files = bool(mode_files) and all(os.path.isfile(m)
                                                 for m in mode_files)
        except TypeError:
            are_files = False
        if are_files:
            est_mode = True
            mode_names = [os.path.splitext(m)[0] for m in mode_files]
            models = [pD.load(m) for m in mode_files]

    if est_mode is None:
        raise ValueError("Unknown mode input!")

    # ---- parse tonic input ----
    # A falsy tonic_freq (None or 0) means the tonic is unknown.
    est_tonic = not tonic_freq
    if est_tonic:
        # take A4 as the dummy frequency value for cent conversion
        tonic_freq = 440

    if not (est_tonic or est_mode):
        raise ValueError("Both tonic and mode are known!")

    # slice the pitch track if specified
    # NOTE(review): after slicing, pitch_track is whatever mF.slice returns
    # as its first value and is passed to hz_to_cent as is - confirm that
    # hz_to_cent/generate_pd accept that form.
    if self.chunk_size > 0:
        time_track = np.arange(0, self.frame_rate * len(pitch_track),
                               self.frame_rate)
        pitch_track, _ = mF.slice(time_track, pitch_track, '',
                                  self.chunk_size)

    # normalize pitch track according to the given tonic frequency
    cent_track = mF.hz_to_cent(pitch_track, ref_freq=tonic_freq)

    # Pitch distribution of the input recording is generated
    distrib = mF.generate_pd(cent_track, ref_freq=tonic_freq,
                             smooth_factor=self.smooth_factor,
                             step_size=self.step_size)

    # convert to PCD, if specified
    if metric == 'pcd':
        distrib = mF.generate_pcd(distrib)

    # Output variables are initiated with placeholders
    tonic_ranked = [('', 0) for _ in range(rank)]
    mode_ranked = [('', 0) for _ in range(rank)]

    # Preliminary steps for tonic identification
    if est_tonic:
        if metric == 'pcd':
            # If there happens to be a peak at the last (and first due to the
            # circular nature of PCD) sample, it is considered as two peaks,
            # one at the end and one at the beginning. To prevent this, we
            # find the global minima (as it is easy to compute) of the
            # distribution and make it the new reference frequency, i.e.
            # shift it to the beginning.
            shift_factor = distrib.vals.tolist().index(min(distrib.vals))
            distrib = distrib.shift(shift_factor)

            # update to the new reference frequency after shift
            tonic_freq = mF.cent_to_hz([distrib.bins[shift_factor]],
                                       ref_freq=tonic_freq)[0]

            # Find the peaks of the distribution. These are the tonic
            # candidates.
            peak_idxs, peak_vals = distrib.detect_peaks()
        elif metric == 'pD':
            # Find the peaks of the distribution. These are the tonic
            # candidates.
            peak_idxs, peak_vals = distrib.detect_peaks()

            # The number of samples to be shifted is the list
            # [peak indices - zero bin]. origin is the bin with value zero
            # and the shifting is done w.r.t. it.
            origin = np.where(distrib.bins == 0)[0][0]
            shift_idxs = [(idx - origin) for idx in peak_idxs]
        else:
            # Without this check the candidates would be undefined and the
            # code below would fail on a missing name.
            raise ValueError("Unknown metric for tonic estimation: %r"
                             % (metric,))

    # Joint Estimation
    if est_tonic and est_mode:
        if metric == 'pD':
            # Since PD lengths aren't equal, the distributions are compared
            # through tonic_estimate() of ModeFunctions. It handles a single
            # model at a time, so the columns of the matrix are generated
            # iteratively.
            dist_mat = np.zeros((len(shift_idxs), len(models)))
            for col, candidate_model in enumerate(models):
                dist_mat[:, col] = mF.tonic_estimate(
                    distrib, shift_idxs, candidate_model,
                    distance_method=distance_method, metric=metric,
                    step_size=self.step_size)
        else:
            # PCD doesn't require any preliminary steps. Generate the
            # distance matrix. The rows are tonic candidates and columns are
            # mode candidates.
            dist_mat = mF.generate_distance_matrix(distrib, peak_idxs,
                                                   models,
                                                   method=distance_method)

        # Distance matrix is ready now. For each rank (i.e. each tonic-mode
        # estimate pair) the loop is iterated. When the best estimate is
        # found it's changed to the worst, so in the next iteration the
        # estimate would be the second best and so on.
        for r in range(min(rank, len(peak_idxs))):
            # The first minimum of the distance matrix in row-major order;
            # its row is the tonic candidate and its column the mode.
            rows, cols = np.where(dist_mat == np.amin(dist_mat))
            min_row, min_col = rows[0], cols[0]
            min_dist = dist_mat[min_row][min_col]

            # Due to the precaution step of PCD, the reference frequency is
            # changed. That's why it's treated differently than PD. Here,
            # the cent value of the tonic estimate is converted back to Hz.
            if metric == 'pcd':
                tonic_cent = distrib.bins[peak_idxs[min_row]]
            else:
                tonic_cent = shift_idxs[min_row] * self.step_size
            tonic_ranked[r] = (mF.cent_to_hz([tonic_cent], tonic_freq)[0],
                               min_dist)

            # Current mode estimate is recorded.
            mode_ranked[r] = (mode_names[min_col], min_dist)

            # The minimum value is replaced with a value larger than maximum,
            # so we won't return this estimate pair twice.
            dist_mat[min_row][min_col] = (np.amax(dist_mat) + 1)
        return mode_ranked, tonic_ranked

    # Tonic Estimation
    if est_tonic:
        # The candidates are handed over as shift amounts for PD and as
        # peak indices for PCD.
        candidate_idxs = shift_idxs if (metric == 'pD') else peak_idxs

        # Distance vector is generated by tonic_estimate() of ModeFunctions.
        # The mode is already known, so there is only one model to be
        # compared. Each entry corresponds to one tonic candidate.
        distance_vector = mF.tonic_estimate(distrib, candidate_idxs, model,
                                            distance_method=distance_method,
                                            metric=metric,
                                            step_size=self.step_size)

        # Distance vector is ready now. For each rank, the loop is iterated.
        # When the best estimate is found it's changed to be the worst, so in
        # the next iteration the estimate would be the second best and so on.
        for r in range(min(rank, len(candidate_idxs))):
            # Minima is found, corresponding tonic candidate is our current
            # tonic estimate
            idx = np.argmin(distance_vector)

            # Due to the changed reference frequency in PCD's precaution
            # step, PCD and PD are treated differently here.
            # TODO: review this conversion together with the reference
            # frequency update done in the PCD precaution step above.
            if metric == 'pcd':
                tonic_cent = distrib.bins[candidate_idxs[idx]]
            else:
                tonic_cent = candidate_idxs[idx] * self.step_size
            tonic_ranked[r] = (mF.cent_to_hz([tonic_cent], tonic_freq)[0],
                               distance_vector[idx])

            # Current minima is replaced with a value larger than maxima,
            # so that we won't return the same estimate twice.
            distance_vector[idx] = (np.amax(distance_vector) + 1)
        return tonic_ranked

    # Mode Estimation: the only remaining case, since the validation above
    # guarantees that at least one of tonic and mode is to be estimated.
    # Since tonic is known, the distribution isn't shifted and is only
    # compared to the candidate mode models by mode_estimate() of
    # ModeFunctions.
    distance_vector = mF.mode_estimate(distrib, models,
                                       distance_method=distance_method,
                                       metric=metric,
                                       step_size=self.step_size)

    # Distance vector is ready now. For each rank, the loop is iterated.
    # When the best estimate is found it's changed to be the worst, so in the
    # next iteration the estimate would be the second best and so on.
    for r in range(min(rank, len(mode_names))):
        # Minima is found, corresponding mode candidate is our current
        # mode estimate
        idx = np.argmin(distance_vector)
        mode_ranked[r] = (mode_names[idx], distance_vector[idx])

        # Current minima is replaced with a value larger than maxima,
        # so that we won't return the same estimate twice.
        distance_vector[idx] = (np.amax(distance_vector) + 1)
    return mode_ranked