/
monophonic_tonality_sorter.py
153 lines (144 loc) · 6.73 KB
/
monophonic_tonality_sorter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# monophonic_tonality_sorter.py
# Given a directory of audio snippet files, sort them by note according to their monophonic tonality
#
# Colin Raffel, 2012
import utility
import sys
import os
import shutil
import numpy as np
# Relative tolerance on the ratio targetHz/detectedHz: a candidate file is accepted
# for a note when that ratio lies within [1 - PITCH_TOLERANCE, 1 + PITCH_TOLERANCE]
PITCH_TOLERANCE=.05
# Maximum number of top-ranked candidate files that are pitch-analyzed per note
PITCHES_TO_RUN=6
class MonophonicTonalitySorter(object):
    """Pick, for every note in a range, the best matching snippet from a directory.

    Every .wav file under snippetDirectory gets a "monophonic tonality" score
    for each of the nNotes MIDI notes starting at baseNote.  For each note the
    highest scoring files are pitch-checked with a YIN detector, and the
    selected file is copied to destinationDirectory as "<midi note>.wav".

    Constructing an instance performs the whole sort (see __init__).
    """

    def __init__( self, snippetDirectory, destinationDirectory, fs = 44100, nNotes = 72, baseNote = 24 ):
        """Store the parameters, create the output directory and run the sort.

        snippetDirectory     -- directory searched for .wav snippets
        destinationDirectory -- directory the chosen files are copied into
        fs                   -- sampling rate every snippet is expected to have
        nNotes               -- number of consecutive MIDI notes to fill
        baseNote             -- MIDI number of the lowest note
        """
        # Store input params
        self.snippetDirectory = snippetDirectory
        self.destinationDirectory = destinationDirectory
        self.fs = fs
        self.nNotes = nNotes
        self.baseNote = baseNote
        # Create the destination dir if it doesn't exist
        if not os.path.exists( self.destinationDirectory ):
            os.makedirs( self.destinationDirectory )
        # Sort the files and copy them out
        self.sortFiles()

    def sortFiles( self ):
        """Score every snippet for every note and copy out one file per note.

        For each note, up to PITCHES_TO_RUN of the highest scoring files are
        examined in order of decreasing tonality; the first one whose detected
        pitch is within PITCH_TOLERANCE of the note is copied.  If none of them
        matches, the file with the highest tonality score is copied instead.
        """
        # Get list of wav files, recursively
        fileList = utility.getFiles( self.snippetDirectory, '.wav' )
        # Nothing to rank: bail out instead of failing on an empty ranking below
        if len( fileList ) == 0:
            print( "No .wav files found in %s" % self.snippetDirectory )
            return
        # Array of per-file, per-note tonalities.  Each file gets a score for each pitch
        tonalities = np.zeros( ( len( fileList ), self.nNotes ) )
        print( "Getting tonalities..." )
        for fileIndex, fileName in enumerate( fileList ):
            # Get audio data
            audioData, fs = utility.getWavData( fileName )
            # Make sure the sampling rate matches...
            assert fs == self.fs, "%s has sampling rate %s, expected %s" % ( fileName, fs, self.fs )
            # Get the tonality scores for all notes for this file
            tonalities[fileIndex] = self.getTonality( audioData )
        # Cache of detected fundamental frequencies, keyed by index into fileList,
        # so that each file is pitch-analyzed at most once
        fileFrequencies = {}
        # Find the best file for each note
        print( "Finding best candidates for notes" )
        for n in range( self.nNotes ):
            # File indices ordered from highest to lowest tonality for this note
            ranking = np.argsort( tonalities[:,n] )[::-1]
            # What frequency is the note we're looking for?
            targetHz = utility.midiToHz( self.baseNote + n )
            # Fall back to the most tonal file unless a candidate passes the pitch check
            chosen = int( ranking[0] )
            for candidate in ranking[:PITCHES_TO_RUN]:
                candidate = int( candidate )
                # If this file has not been YIN analyzed yet, analyze it and remember the result
                if candidate not in fileFrequencies:
                    audioData, fs = utility.getWavData( fileList[candidate] )
                    fileFrequencies[candidate] = self.yinPitchDetect( audioData )
                detectedHz = fileFrequencies[candidate]
                # A non-positive or non-finite estimate can never match
                if not ( np.isfinite( detectedHz ) and detectedHz > 0 ):
                    continue
                ratio = targetHz/detectedHz
                if (1 - PITCH_TOLERANCE) <= ratio <= (1 + PITCH_TOLERANCE):
                    chosen = candidate
                    break
            shutil.copy( fileList[chosen], os.path.join( self.destinationDirectory, str(n + self.baseNote) + ".wav" ) )

    # Get the tonality score for some audio data
    def getTonality( self, audioData ):
        """Return an array of nNotes tonality scores for one audio signal.

        The score for a note is (largest spectral magnitude among the note's
        mask bins) / (mean magnitude of all other bins), weighted by a level
        term.  Notes whose mask selects no bins, or whose remaining bins have
        zero mean, score 0.  audioData is assumed to be a 1-D (mono) signal.
        """
        audioData = np.asarray( audioData, dtype=float )
        # Number of samples in the audio data
        N = audioData.shape[0]
        # Tonality scores for each note
        tonalityScores = np.zeros( self.nNotes )
        if N == 0:
            return tonalityScores
        # Get magnitude spectrum for this audio data
        spectrum = np.abs( np.fft.rfft( audioData ) )
        # Level weighting.  NOTE(review): this is sqrt(sum(x^2))/N rather than a
        # true RMS (sqrt(mean(x^2))); kept as-is so existing rankings do not change.
        level = np.sqrt( np.sum( audioData**2.0 ) )/(N*1.0)
        # For each note
        for note in range( self.nNotes ):
            # Create a mask for only bins which are harmonics of the note in question
            mask = self.createMask( N, utility.midiToHz( self.baseNote + note ) )
            # The mask is shorter than the rfft output, so only compare the bins it covers
            bins = spectrum[:mask.shape[0]]
            harmonicBins = bins[mask == 1]
            otherBins = bins[mask == 0]
            if harmonicBins.size == 0 or otherBins.size == 0:
                continue
            noiseFloor = np.mean( otherBins )
            if noiseFloor <= 0:
                continue
            # "Monophonic tonality" - max of bins corresponding to harmonics/spectral mean
            tonalityScores[note] = (np.max( harmonicBins )/noiseFloor)*level
        return tonalityScores

    # Create an array of 1s and 0s with 1s at harmonic multiples of the base frequency
    def createMask( self, N, baseFrequency ):
        """Return a 0/1 array of N//2 - 1 bins marking harmonics of baseFrequency.

        Only the multiples 1, 2, 4 and 8 of baseFrequency are marked, and only
        while they stay below the maximum frequency computed in the body.
        """
        # Number of bins in the mask (integer division so it is a valid array size)
        nBins = max( int( N )//2 - 1, 0 )
        # Maximum frequency that we want to create a mask for, kept below 90% of nyquist
        maxFrequency = min( utility.midiToHz( self.baseNote + self.nNotes + 24 ), (self.fs*.9)/2.0 )
        # Create the mask
        mask = np.zeros( nBins )
        # Over the octave-spaced harmonics (1x, 2x, 4x, 8x)
        for harmonic in baseFrequency*(2**np.arange( 4 )):
            if harmonic > maxFrequency:
                break
            # Set bins near this harmonic to 1 (presumably hzToBins returns the
            # bin indices around the frequency - confirm against utility)
            mask[utility.hzToBins( harmonic, N, self.fs )] = 1
        return mask

    # Yin pitch detector.  Returns f0 of input frame.
    def yinPitchDetect( self, frame, threshold=0.15, W=None ):
        """Estimate the fundamental frequency (Hz) of frame with the YIN method.

        frame     -- 1-D array of samples at sampling rate self.fs
        threshold -- the first local minimum of the normalized difference
                     function below this value is taken as the period
        W         -- integration window in samples; defaults to half the frame

        Raises ValueError when the frame leaves fewer than three lags to search.
        """
        # Work in floating point so integer PCM samples cannot overflow when subtracted
        frame = np.asarray( frame, dtype=float )
        nSamples = frame.shape[0]
        # Assume window size = frame size/2
        if not W:
            W = nSamples//2
        W = int( W )
        # Number of lags possible
        nLags = nSamples - W
        if nLags < 3:
            raise ValueError( "frame is too short for YIN pitch detection" )
        # Calculate squared difference for all lags
        reference = frame[:W]
        squaredDifference = np.zeros( nLags )
        for tau in range( nLags ):
            squaredDifference[tau] = np.sum( (reference - frame[tau:W + tau])**2.0 )
        # Calculate the "cumulative-mean-normalized" square difference:
        # d'(0) = 1, d'(tau) = d(tau)*tau/sum(d(1..tau)), the sum including d(tau)
        runningSum = np.cumsum( squaredDifference[1:] )
        lags = np.arange( 1, nLags )
        # Lags whose running sum is zero (e.g. digital silence) keep the neutral value 1
        normalizedTail = np.ones( nLags - 1 )
        nonZero = runningSum > 0
        normalizedTail[nonZero] = squaredDifference[1:][nonZero]*lags[nonZero]/runningSum[nonZero]
        normalized = np.ones( nLags )
        normalized[1:] = normalizedTail
        # Find first local minima which is below the threshold
        interior = normalized[1:-1]
        dips = np.flatnonzero( (normalized[:-2] > interior) & (normalized[2:] > interior) & (interior < threshold) )
        if dips.size > 0:
            lag = int( dips[0] ) + 1
        else:
            # No local minima found below threshold: use the global interior minimum.
            # argmin is relative to the slice, which starts at lag 1.
            lag = int( np.argmin( interior ) ) + 1
        # Parabolic interpolation around the chosen lag
        before = normalized[lag - 1]
        after = normalized[lag + 1]
        curvature = before + after - 2.0*normalized[lag]
        peakOffset = 0.0
        # Only refine when the three points form an upward-opening parabola
        if curvature > 0:
            peakOffset = (before - after)/(2.0*curvature)
            # A true minimum lies within half a sample of the chosen lag
            peakOffset = max( -0.5, min( 0.5, peakOffset ) )
        return self.fs/(lag + peakOffset)
# Command-line entry point: sort the snippets in argv[1] into argv[2]
if __name__ == "__main__":
    # Both the snippet directory and the output directory are required
    if len(sys.argv) < 3:
        print("Usage: %s snippetDirectory outputDirectory" % sys.argv[0])
        sys.exit(-1)
    MonophonicTonalitySorter( sys.argv[1], sys.argv[2] )