def __data_generation_mel(self, list_IDs_temp):
    """Assemble one batch of mel-spectrogram inputs with one-hot labels.

    Every ID in ``list_IDs_temp`` is looked up in ``self.labels``. Samples
    labelled ``1`` are read from ``<self.location>/right/<ID>``; all other
    samples are read from ``<self.location>/nonright/<ID>``.

    Returns:
        A tuple ``(X, y)``. ``X`` is reshaped to
        ``(batch_size, dim[0], dim[1], 1)`` and ``y`` is the result of
        ``keras.utils.to_categorical`` with ``self.n_classes`` classes.
    """
    # The buffers are allocated with np.empty, so any row that is not
    # filled below (fewer IDs than batch_size) keeps arbitrary content.
    batch_shape = (self.batch_size, self.n_channels) + tuple(self.dim)
    X = np.empty(batch_shape)
    y = np.empty((self.batch_size), dtype=int)

    for row, sample_id in enumerate(list_IDs_temp):
        # The label decides which sub-directory holds the audio file.
        if self.labels[sample_id] == 1:
            folder = '/right/'
        else:
            folder = '/nonright/'
        X[row, ] = ap.compute_melgram(self.location + folder + sample_id)
        y[row] = self.labels[sample_id]

    # Element counts only match for this reshape when n_channels == 1.
    target_shape = (self.batch_size, self.dim[0], self.dim[1], 1)
    X = X.reshape(target_shape)
    one_hot = keras.utils.to_categorical(y, num_classes=self.n_classes)
    return X, one_hot
def extract_melgrams(list_path, MULTIFRAMES, process_all_song, num_songs_genre):
    """Compute mel-spectrograms for every song listed in a text file.

    Args:
        list_path: Path of a text file with one song path per line.
        MULTIFRAMES: When truthy, each song is processed with
            ``ap.compute_melgram_multiframe`` and its frame count is
            recorded; otherwise ``ap.compute_melgram`` is used.
        process_all_song: Forwarded to ``ap.compute_melgram_multiframe``.
        num_songs_genre: Number of consecutive songs per genre, or ``''``
            to disable labelling.

    Returns:
        ``(melgrams, labels, num_frames_total)`` when ``num_songs_genre``
        is not ``''``, otherwise ``(melgrams, num_frames_total)``.
        ``melgrams`` is all spectrograms concatenated along axis 0, so each
        one must have trailing shape ``(1, 96, 1366)``. Labels and frame
        counts are only collected in multi-frame mode.
    """
    # Close the list file deterministically instead of leaking the handle.
    with open(list_path, 'r') as list_file:
        song_paths = list_file.read().splitlines()

    labels = []
    num_frames_total = []
    # The empty float32 seed keeps the result shape/dtype rules of the
    # original incremental concatenation, including for an empty list.
    chunks = [np.zeros((0, 1, 96, 1366), dtype=np.float32)]

    for song_ind, song_path in enumerate(song_paths):
        print(song_path)
        if MULTIFRAMES:
            melgram = ap.compute_melgram_multiframe(song_path, process_all_song)
            num_frames = melgram.shape[0]
            num_frames_total.append(num_frames)
            print('num frames:', num_frames)
            if num_songs_genre != '':
                # Songs are grouped by genre in blocks of num_songs_genre;
                # every frame of a song gets that song's genre index.
                index = int(song_ind // num_songs_genre)
                labels.extend([index] * num_frames)
        else:
            melgram = ap.compute_melgram(song_path)
        chunks.append(melgram)

    # One concatenation at the end avoids recopying the array per song.
    melgrams = np.concatenate(chunks, axis=0)

    if num_songs_genre != '':
        return melgrams, labels, num_frames_total
    return melgrams, num_frames_total
def main(net):
    """Tag the bundled example tracks and print the top-15 tags per track.

    Args:
        net: ``'cnn'`` (built with ``convnet.build_convnet_model``) or
            ``'rnn'`` (built with ``recurrentnet.build_recurrentnet_model``).
            Weights are loaded from ``data/<net>_weights_best.hdf5``.

    Raises:
        ValueError: If ``net`` is not one of the supported names. The
            original fell through and crashed later on an unbound ``model``.
    """
    if net not in ('cnn', 'rnn'):
        raise ValueError("net must be 'cnn' or 'rnn', got %r" % (net,))
    print('Running main() with network: %s' % net)

    # setting
    audio_paths = [
        'data/bensound-cute.mp3',
        'data/bensound-actionable.mp3',
        'data/bensound-dubstep.mp3',
        'data/bensound-thejazzpiano.mp3',
    ]
    melgram_paths = [
        'data/bensound-cute.npy',
        'data/bensound-actionable.npy',
        'data/bensound-dubstep.npy',
        'data/bensound-thejazzpiano.npy',
    ]
    tags = [
        'rock', 'pop', 'alternative', 'indie', 'electronic',
        'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
        'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
        'soul', 'indie rock', 'Mellow', 'electronica', '80s',
        'folk', '90s', 'chill', 'instrumental', 'punk',
        'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
        'party', 'country', 'easy listening', 'sexy', 'catchy',
        'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
        'rnb', 'indie pop', 'sad', 'House', 'happy',
    ]

    # prepare data like this
    melgrams = np.zeros((0, 1, 96, 1366))
    # NOTE(review): `librosa_exists` is defined outside this block; if it is
    # a function rather than a flag, this test is always true - confirm.
    if librosa_exists:
        for audio_path in audio_paths:
            melgram = ap.compute_melgram(audio_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)
    else:
        # Fall back to the pre-computed spectrograms stored as .npy files.
        for melgram_path in melgram_paths:
            melgram = np.load(melgram_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)

    # load model like this
    if net == 'cnn':
        model = convnet.build_convnet_model()
    else:
        model = recurrentnet.build_recurrentnet_model()
    model.summary()
    print('Loading weights of %s...' % net)
    model.load_weights('data/%s_weights_best.hdf5' % net)

    # predict the tags like this
    print('Predicting...')
    start = time.time()
    pred_tags = model.predict(melgrams)

    # print like this...
    # Function-call form so the module parses on Python 2 and Python 3.
    print("Prediction is done. It took %d seconds." % (time.time() - start))
    print('Printing top-15 tags for each track...')
    for song_idx, audio_path in enumerate(audio_paths):
        sorted_result = sort_result(tags, pred_tags[song_idx, :].tolist())
        print(audio_path)
        print(sorted_result[:5])
        print(sorted_result[5:10])
        print(sorted_result[10:15])
        print(' ')
def load_mel_examples(limit=10, fname="./data/clip_info_final.csv", num_rows=96, num_cols=1366):
    """Load mel-spectrograms for the first ``limit`` clips of an annotation file.

    Args:
        limit: Maximum number of rows of the ``mp3_path`` column to use.
        fname: Tab-separated annotation file with an ``mp3_path`` column.
        num_rows: Spectrogram height used for the all-zero placeholder.
        num_cols: Spectrogram width used for the all-zero placeholder.

    Returns:
        A float array of shape ``(n, 1, num_rows, num_cols)``. Rows whose
        ``mp3_path`` is missing are all-zero placeholders; the others come
        from ``ap.compute_melgram("./data/mp3/<mp3_path>")``, whose output
        must have a compatible shape for the concatenation to succeed.
    """
    df = pd.read_csv(fname, delimiter="\t")
    # DataFrame.info() prints its report itself and returns None, so
    # wrapping it in print() only added a stray "None" line.
    df.info()
    mp3_paths = df["mp3_path"][:limit]

    # Seed with an empty array so an empty selection still yields the
    # documented 4-D shape; concatenate once instead of per clip.
    chunks = [np.empty((0, 1, num_rows, num_cols), dtype=float)]
    for p in mp3_paths:
        # pd.isnull catches every NaN value; the identity test against
        # np.NaN missed NaNs that are not the singleton object, and that
        # alias no longer exists in NumPy 2.
        if pd.isnull(p):
            x = np.zeros((1, 1, num_rows, num_cols), dtype=float)
        else:
            print(p)
            path = "./data/mp3/%s" % (p,)
            x = ap.compute_melgram(path)
        chunks.append(x)

    return np.concatenate(chunks, axis=0)
def main(net):
    """Run a MusicTagger model on the bundled example tracks and print the result.

    Args:
        net: ``'cnn'`` (``MusicTaggerCNN``) or ``'crnn'`` (``MusicTaggerCRNN``).
            Both are built with ``weights='msd'`` and ``include_top=False``.

    Raises:
        ValueError: If ``net`` is not one of the supported names. The
            original fell through and crashed later on an unbound ``model``.
    """
    if net not in ('cnn', 'crnn'):
        raise ValueError("net must be 'cnn' or 'crnn', got %r" % (net,))
    print('Running main() with network: %s and backend: %s' % (net, K._BACKEND))

    # setting
    audio_paths = [
        'data/bensound-cute.mp3',
        'data/bensound-actionable.mp3',
        'data/bensound-dubstep.mp3',
        'data/bensound-thejazzpiano.mp3',
    ]
    melgram_paths = [
        'data/bensound-cute.npy',
        'data/bensound-actionable.npy',
        'data/bensound-dubstep.npy',
        'data/bensound-thejazzpiano.npy',
    ]
    tags = [
        'rock', 'pop', 'alternative', 'indie', 'electronic',
        'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
        'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
        'soul', 'indie rock', 'Mellow', 'electronica', '80s',
        'folk', '90s', 'chill', 'instrumental', 'punk',
        'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
        'party', 'country', 'easy listening', 'sexy', 'catchy',
        'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
        'rnb', 'indie pop', 'sad', 'House', 'happy',
    ]

    # prepare data like this
    melgrams = np.zeros((0, 1, 96, 1366))
    # NOTE(review): `librosa_exists` is defined outside this block; if it is
    # a function rather than a flag, this test is always true - confirm.
    if librosa_exists:
        for audio_path in audio_paths:
            melgram = ap.compute_melgram(audio_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)
    else:
        # Fall back to the pre-computed spectrograms stored as .npy files.
        for melgram_path in melgram_paths:
            melgram = np.load(melgram_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)

    # load model like this
    # NOTE(review): include_top=False is passed, yet the output below is
    # ranked against the 50 tag names - confirm the model output matches.
    if net == 'cnn':
        model = MusicTaggerCNN(weights='msd', include_top=False)
    else:
        model = MusicTaggerCRNN(weights='msd', include_top=False)

    # predict the tags like this
    print('Predicting... with melgrams: ', melgrams.shape)
    start = time.time()
    pred_tags = model.predict(melgrams)

    # print like this...
    print("Prediction is done. It took %d seconds." % (time.time() - start))
    # The banner says top-10, but the complete ranking is printed.
    print('Printing top-10 tags for each track...')
    for song_idx, audio_path in enumerate(audio_paths):
        sorted_result = sort_result(tags, pred_tags[song_idx, :].tolist())
        print(audio_path)
        print(sorted_result)
        print(' ')
def __getitem__(self,idx):
    """Return the ``(mel, tags)`` pair for the entry at position ``idx``.

    The last field of ``self.tags_list[idx]`` is the clip file name
    (relative to ``self.root_dir``). Fields ``1`` up to, but excluding,
    the last two are converted to ``int``, and the first 120 of them
    become the tag tensor.
    """
    clip_path = os.path.join(self.root_dir, self.tags_list[idx][-1])

    # Take the first entry along axis 0 of the 4-D result, then move the
    # leading axis of what remains to the end before applying transforms.
    first_item = ap.compute_melgram(clip_path)[0, :, :, :]
    spectrogram = first_item.transpose([1, 2, 0])

    tag_values = [int(field) for field in self.tags_list[idx][1:-2]]
    spectrogram = self.transforms(spectrogram)
    return spectrogram, torch.Tensor(tag_values[:120])
def genrePrediction(filePath):
    '''Predict genre tags for one audio file with the CRNN tagger.

    *WARNING* This model uses Batch Normalization, so the prediction is
    affected by the batch. Use multiple, different data samples together
    (at least 4) for reliable prediction. Note that this function feeds
    a single file to the model.

    Args:
        filePath: Path of the audio file to analyse.

    Returns:
        A list with the entries of ``sort_result(tags, scores)`` whose
        first element is one of the genre names, in ranking order.
    '''
    print('Running genrePrediction() with network: crnn and backend: %s' %
          (K._BACKEND))

    # setting
    audio_paths = [filePath]
    tags = [
        'rock', 'pop', 'alternative', 'indie', 'electronic',
        'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
        'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
        'soul', 'indie rock', 'Mellow', 'electronica', '80s',
        'folk', '90s', 'chill', 'instrumental', 'punk',
        'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
        'party', 'country', 'easy listening', 'sexy', 'catchy',
        'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
        'rnb', 'indie pop', 'sad', 'House', 'happy',
    ]
    # Subset of `tags` that is treated as a genre.
    genres = [
        'rock', 'pop', 'alternative', 'indie', 'electronic',
        'dance', 'alternative rock', 'jazz', 'metal', 'classic rock',
        'soul', 'indie rock', 'electronica', 'folk', 'punk',
        'blues', 'hard rock', 'experimental', 'Hip-Hop', 'heavy metal',
        'country', 'funk', 'electro', 'Progressive rock', 'rnb',
        'indie pop', 'House',
    ]

    # prepare data like this
    melgrams = np.zeros((0, 1, 96, 1366))
    for audio_path in audio_paths:
        melgram = ap.compute_melgram(audio_path)
        melgrams = np.concatenate((melgrams, melgram), axis=0)

    model = MusicTaggerCRNN(weights='msd')

    print('Predicting...')
    start = time.time()
    pred_tags = model.predict(melgrams)
    print("Prediction is done. It took %d seconds." % (time.time() - start))

    sorted_result = sort_result(tags, pred_tags[0, :].tolist())
    print(filePath)

    # Build a real list: on Python 3 filter() is a one-shot iterator, so
    # the print loop would leave nothing to total or to return.
    sorted_result = [item for item in sorted_result if item[0] in genres]
    for item in sorted_result:
        print(item)
    print(' ')
    print('Total = ' + str(sum(float(item[1]) for item in sorted_result)))
    return sorted_result
def extract_melgrams(list_path, process_all_song, num_songs_genre):
    """Compute one mel-spectrogram and one genre label per listed song.

    Args:
        list_path: Path of a text file with one song path per line.
        process_all_song: Unused; kept for interface compatibility.
        num_songs_genre: Number of consecutive songs that share a genre;
            song ``i`` receives label ``i // num_songs_genre``.

    Returns:
        ``(melgrams, labels, num_frames_total)``. ``melgrams`` is all
        spectrograms concatenated along axis 0, so each one must have
        trailing shape ``(1, 96, 1366)``. ``num_frames_total`` is always an
        empty list here; it is never filled in.
    """
    # Close the list file deterministically instead of leaking the handle.
    with open(list_path, 'r') as list_file:
        song_paths = list_file.read().splitlines()

    labels = []
    num_frames_total = []
    # The empty float32 seed keeps the result shape/dtype rules of the
    # original incremental concatenation, including for an empty list.
    chunks = [np.zeros((0, 1, 96, 1366), dtype=np.float32)]

    for song_ind, song_path in enumerate(song_paths):
        chunks.append(ap.compute_melgram(song_path))
        labels.append(int(song_ind // num_songs_genre))

    # One concatenation at the end avoids recopying the array per song.
    melgrams = np.concatenate(chunks, axis=0)
    return melgrams, labels, num_frames_total
def main(net):
    '''Extract and print network features for the bundled example tracks.

    *WARNING* This model uses Batch Normalization, so the prediction is
    affected by the batch. Use multiple, different data samples together
    (at least 4) for reliable prediction.

    Args:
        net: ``'cnn'`` (``MusicTaggerCNN``) or ``'crnn'`` (``MusicTaggerCRNN``).
            Both are built with ``weights=None`` and ``include_top=False``.

    Raises:
        ValueError: If ``net`` is not one of the supported names. The
            original fell through and crashed later on an unbound ``model``.
    '''
    if net not in ('cnn', 'crnn'):
        raise ValueError("net must be 'cnn' or 'crnn', got %r" % (net,))
    print('Running main() with network: %s and backend: %s' % (net, K._BACKEND))

    # setting
    audio_paths = [
        'data/bensound-cute.mp3',
        'data/bensound-actionable.mp3',
        'data/bensound-dubstep.mp3',
        'data/bensound-thejazzpiano.mp3',
    ]
    melgram_paths = [
        'data/bensound-cute.npy',
        'data/bensound-actionable.npy',
        'data/bensound-dubstep.npy',
        'data/bensound-thejazzpiano.npy',
    ]

    # prepare data like this
    melgrams = np.zeros((0, 1, 96, 1366))
    # NOTE(review): `librosa_exists` is defined outside this block; if it is
    # a function rather than a flag, this test is always true - confirm.
    if librosa_exists:
        for audio_path in audio_paths:
            melgram = ap.compute_melgram(audio_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)
    else:
        # Fall back to the pre-computed spectrograms stored as .npy files.
        for melgram_path in melgram_paths:
            melgram = np.load(melgram_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)

    # load model like this
    # NOTE(review): weights=None replaces the earlier weights='msd' call;
    # presumably the model is then not pre-trained - confirm this is wanted.
    if net == 'cnn':
        model = MusicTaggerCNN(weights=None, include_top=False)
    else:
        model = MusicTaggerCRNN(weights=None, include_top=False)

    # predict the tags like this
    print('Predicting features...')
    start = time.time()  # kept from the original; the elapsed time is not reported
    features = model.predict(melgrams)
    print(features[:, :])
    print(len(features))
def main(num_unit, unit_number):
    """Dump the mel-spectrograms of one shard of the annotated clips as text.

    The rows of the ``mp3_path`` column of the tab-separated ``annotations``
    file are split round-robin into ``num_unit`` shards; this call handles
    the rows where ``row % num_unit == unit_number``. For each such clip
    one line of space-terminated ``index:value`` pairs is written to
    ``train_svmdata + "<unit_number>_<num_unit>"``, with 1-based indices
    running row-major over the last two axes of the spectrogram.
    """
    df = pd.read_csv(annotations, delimiter="\t")
    out_name = train_svmdata + "%d_%d" % (unit_number, num_unit)

    with open(out_name, "w") as sink:
        for row, path in enumerate(df['mp3_path']):
            # Skip clips that belong to another shard.
            if row % num_unit != unit_number:
                continue

            path = os.path.join("./data/mp3", path)
            print(unit_number, row, path)
            melgram = ap.compute_melgram(path)

            # `.flat` walks the 2-D plane row by row, which matches the
            # i * cols + j + 1 numbering of an explicit nested loop.
            plane = melgram[0, 0, :, :]
            pairs = ("%d:%f " % (pos, val)
                     for pos, val in enumerate(plane.flat, 1))
            sink.write("".join(pairs))
            sink.write("\n")
def extract_melgrams(song_paths, MULTIFRAMES, process_all_song, num_songs_genre):
    """Compute mel-spectrograms for an iterable of song paths.

    Args:
        song_paths: Iterable of audio file paths.
        MULTIFRAMES: When truthy, each song is processed with
            ``ap.compute_melgram_multiframe`` and its frame count is
            recorded; otherwise ``ap.compute_melgram`` is used.
        process_all_song: Forwarded to ``ap.compute_melgram_multiframe``.
        num_songs_genre: Unused; kept for interface compatibility.

    Returns:
        ``(melgrams, num_frames_total)``. ``melgrams`` is all spectrograms
        concatenated along axis 0, so each one must have trailing shape
        ``(1, 96, 1366)``. ``num_frames_total`` holds one frame count per
        song in multi-frame mode and is empty otherwise.
    """
    num_frames_total = []
    # The empty float32 seed keeps the result shape/dtype rules of the
    # original incremental concatenation, including for empty input.
    chunks = [np.zeros((0, 1, 96, 1366), dtype=np.float32)]

    for song_path in song_paths:
        # Single-argument print calls give the same output on Python 2 and
        # Python 3; the original statements only parsed on Python 2.
        print(song_path)
        if MULTIFRAMES:
            melgram = ap.compute_melgram_multiframe(song_path, process_all_song)
            num_frames = melgram.shape[0]
            num_frames_total.append(num_frames)
            print('num frames: %s' % (num_frames,))
        else:
            melgram = ap.compute_melgram(song_path)
        chunks.append(melgram)

    # One concatenation at the end avoids recopying the array per song.
    melgrams = np.concatenate(chunks, axis=0)
    return melgrams, num_frames_total
def main(net):
    '''Tag the bundled example tracks and print the top-10 tags per track.

    *WARNING* This model uses Batch Normalization, so the prediction is
    affected by the batch. Use multiple, different data samples together
    (at least 4) for reliable prediction.

    Args:
        net: ``'cnn'`` (``AudioConvnet``) or ``'rnn'`` (``AudioConvRNN``).
            Weights are loaded from ``data/<net>_weights_<backend>.h5``.

    Raises:
        ValueError: If ``net`` is not one of the supported names. The
            original fell through and crashed later on an unbound ``model``.
    '''
    if net not in ('cnn', 'rnn'):
        raise ValueError("net must be 'cnn' or 'rnn', got %r" % (net,))
    print('Running main() with network: %s and backend: %s' % (net, K._BACKEND))

    # setting
    audio_paths = [
        'data/bensound-cute.mp3',
        'data/bensound-actionable.mp3',
        'data/bensound-dubstep.mp3',
        'data/bensound-thejazzpiano.mp3',
    ]
    melgram_paths = [
        'data/bensound-cute.npy',
        'data/bensound-actionable.npy',
        'data/bensound-dubstep.npy',
        'data/bensound-thejazzpiano.npy',
    ]
    tags = [
        'rock', 'pop', 'alternative', 'indie', 'electronic',
        'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
        'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
        'soul', 'indie rock', 'Mellow', 'electronica', '80s',
        'folk', '90s', 'chill', 'instrumental', 'punk',
        'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
        'party', 'country', 'easy listening', 'sexy', 'catchy',
        'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
        'rnb', 'indie pop', 'sad', 'House', 'happy',
    ]

    # prepare data like this
    melgrams = np.zeros((0, 1, 96, 1366))
    # NOTE(review): `librosa_exists` is defined outside this block; if it is
    # a function rather than a flag, this test is always true - confirm.
    if librosa_exists:
        for audio_path in audio_paths:
            melgram = ap.compute_melgram(audio_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)
    else:
        # Fall back to the pre-computed spectrograms stored as .npy files.
        for melgram_path in melgram_paths:
            melgram = np.load(melgram_path)
            melgrams = np.concatenate((melgrams, melgram), axis=0)

    # load model like this
    if net == 'cnn':
        model = AudioConvnet()
    else:
        model = AudioConvRNN()

    print('Loading weights of %s...' % net)
    model.load_weights('data/%s_weights_%s.h5' % (net, K._BACKEND))

    # predict the tags like this
    print('Predicting...')
    start = time.time()
    pred_tags = model.predict(melgrams)

    # print like this...
    # Function-call form so the module parses on Python 2 and Python 3.
    print("Prediction is done. It took %d seconds." % (time.time() - start))
    print('Printing top-10 tags for each track...')
    for song_idx, audio_path in enumerate(audio_paths):
        sorted_result = sort_result(tags, pred_tags[song_idx, :].tolist())
        print(audio_path)
        print(sorted_result[:5])
        print(sorted_result[5:10])
        print(' ')
'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica', '80s', 'folk', '90s', 'chill', 'instrumental', 'punk', 'oldies', 'blues', 'hard rock', 'ambient', 'acoustic', 'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s', 'party', 'country', 'easy listening', 'sexy', 'catchy', 'funk', 'electro', 'heavy metal', 'Progressive rock', '60s', 'rnb', 'indie pop', 'sad', 'House', 'happy'] # prepare data like this melgrams = np.zeros((0, 1, 96, 1366)) if librosa_exists: for audio_path in audio_paths: print 'extracting audio: ', audio_path melgram = ap.compute_melgram(audio_path) melgrams = np.concatenate((melgrams, melgram), axis=0) else: for melgram_path in melgram_paths: print 'loading melgram: ', melgram_path melgram = np.load(melgram_path) melgrams = np.concatenate((melgrams, melgram), axis=0) # load model like this if net == 'cnn': model = MusicTaggerCNN(weights='msd') elif net == 'crnn': # model = MusicTaggerCRNN(weights='msd') # model.save('/Users/Frank/Documents/UCSC/TIM_209/project/demo/test/mysite/trips/music-auto-tagging/music_tagger_crnn.model.h5') model = load_model(os.path.join(os.getcwd(), 'musictagger', 'music-auto-tagging', 'music_tagger_crnn.model.h5'))
def librosa_exists():
    """Return True when the ``librosa`` package can be imported, else False."""
    try:
        __import__('librosa')
    except ImportError:
        return False
    else:
        return True


audio_paths = [
    'data/bensound-cute.mp3',
    'data/bensound-actionable.mp3',
    'data/bensound-dubstep.mp3',
    'data/bensound-thejazzpiano.mp3',
]
melgram_paths = [
    'data/bensound-cute.npy',
    'data/bensound-actionable.npy',
    'data/bensound-dubstep.npy',
    'data/bensound-thejazzpiano.npy',
]
tags = [
    'rock', 'pop', 'alternative', 'indie', 'electronic',
    'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
    'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
    'soul', 'indie rock', 'Mellow', 'electronica', '80s',
    'folk', '90s', 'chill', 'instrumental', 'punk',
    'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
    'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
    'party', 'country', 'easy listening', 'sexy', 'catchy',
    'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
    'rnb', 'indie pop', 'sad', 'House', 'happy',
]

# prepare data like this
melgrams = np.zeros((0, 1, 96, 1366))
# Call the probe: the bare function object is always truthy, which made the
# pre-computed .npy fallback below unreachable.
if librosa_exists():
    for audio_path in audio_paths:
        melgram = ap.compute_melgram(audio_path)
        melgrams = np.concatenate((melgrams, melgram), axis=0)
else:
    for melgram_path in melgram_paths:
        melgram = np.load(melgram_path)
        melgrams = np.concatenate((melgrams, melgram), axis=0)

TH_WEIGHTS_PATH = 'https://github.com/keunwoochoi/music-auto_tagging-keras/blob/master/data/music_tagger_cnn_weights_theano.h5'

# Fixed settings of this script.
weights = 'msd'
input_tensor = None
include_top = True

if weights not in {'msd', None}:
    raise ValueError('The `weights` argument should be either '
                     '`None` (random initialization) or `msd` '
                     '(pre-training on Million Song Dataset).')

# Input layout depends on the backend's image dimension ordering.
# NOTE(review): `melgrams` above is built channel-first (N, 1, 96, 1366),
# which only matches the 'th' input shape - confirm for other orderings.
if K.image_dim_ordering() == 'th':
    input_shape = (1, 96, 1366)
else:
    input_shape = (96, 1366, 1)

if input_tensor is None:
    melgram_input = Input(shape=input_shape)
else:
    if not K.is_keras_tensor(input_tensor):
        melgram_input = Input(tensor=input_tensor, shape=input_shape)
    else:
        melgram_input = input_tensor

if K.image_dim_ordering() == 'th':
    channel_axis = 1
    freq_axis = 2
    time_axis = 3
else:
    channel_axis = 3
    freq_axis = 1
    time_axis = 2

# Input normalisation along the frequency axis.
x = BatchNormalization(axis=freq_axis, name='bn_0_freq')(melgram_input)

# Conv block 1
x = Convolution2D(32, 3, 3, border_mode='same', name='conv1')(x)
x = BatchNormalization(axis=channel_axis, mode=0, name='bn1')(x)
x = ELU()(x)
x = MaxPooling2D(pool_size=(2, 4), name='pool1')(x)

# Conv block 2
x = Convolution2D(64, 3, 3, border_mode='same', name='conv2')(x)
x = BatchNormalization(axis=channel_axis, mode=0, name='bn2')(x)
x = ELU()(x)
x = MaxPooling2D(pool_size=(2, 4), name='pool2')(x)

# Conv block 3
x = Convolution2D(64, 3, 3, border_mode='same', name='conv3')(x)
x = BatchNormalization(axis=channel_axis, mode=0, name='bn3')(x)
x = ELU()(x)
x = MaxPooling2D(pool_size=(2, 4), name='pool3')(x)

# Conv block 4
x = Convolution2D(64, 3, 3, border_mode='same', name='conv4')(x)
x = BatchNormalization(axis=channel_axis, mode=0, name='bn4')(x)
x = ELU()(x)
x = MaxPooling2D(pool_size=(3, 5), name='pool4')(x)

# Conv block 5
x = Convolution2D(32, 3, 3, border_mode='same', name='conv5')(x)
x = BatchNormalization(axis=channel_axis, mode=0, name='bn5')(x)
x = ELU()(x)
x = MaxPooling2D(pool_size=(4, 4), name='pool5')(x)

x = Flatten()(x)
if include_top:
    # One sigmoid output per entry of `tags`.
    x = Dense(50, activation='sigmoid', name='output')(x)

model = Model(melgram_input, x)
print(model)

# NOTE(review): weight loading is disabled below, so no weights are loaded
# into `model` before predicting - confirm this is intended.
# if weights is None:
#     return model
# else:
# Load input
# if K.image_dim_ordering() == 'tf':
#     raise RuntimeError("Please set image_dim_ordering == 'th'."
#                        "You can set it at ~/.keras/keras.json")
# model.load_weights('data/music_tagger_cnn_weights_%s.h5' % K._BACKEND,
#                    by_name=True)

# predict the tags like this
print('Predicting...')
start = time.time()
pred_tags = model.predict(melgrams)

# print like this...
# print("Prediction is done. It took %d seconds." % (time.time() - start))
print('Printing top-10 tags for each track...')
for song_idx, audio_path in enumerate(audio_paths):
    sorted_result = sort_result(tags, pred_tags[song_idx, :].tolist())
    print(audio_path)
    print(sorted_result[:5])
    print(sorted_result[5:10])
    print(' ')
# Example script: tag four bundled tracks with the convnet and print the
# ten highest-ranked tags of each track.

# setting
audio_paths = [
    'data/bensound-cute.mp3',
    'data/bensound-actionable.mp3',
    'data/bensound-dubstep.mp3',
    'data/bensound-thejazzpiano.mp3',
]
tags = [
    'rock', 'pop', 'alternative', 'indie', 'electronic',
    'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
    'beautiful', 'metal', 'chillout', 'male vocalists', 'classic rock',
    'soul', 'indie rock', 'Mellow', 'electronica', '80s',
    'folk', '90s', 'chill', 'instrumental', 'punk',
    'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
    'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s',
    'party', 'country', 'easy listening', 'sexy', 'catchy',
    'funk', 'electro', 'heavy metal', 'Progressive rock', '60s',
    'rnb', 'indie pop', 'sad', 'House', 'happy',
]

# prepare data like this: grow the batch along axis 0, one track at a time
melgrams = np.zeros((0, 1, 96, 1366))
for audio_path in audio_paths:
    melgrams = np.concatenate(
        (melgrams, ap.compute_melgram(audio_path)), axis=0)

# load model like this
model = convnet.build_convnet_model()
model.load_weights('data/weights_best.hdf5')

# predict the tags like this
pred_tags = model.predict(melgrams)

# print like this...
print('Printing top-10 tags for each track...')
for track_no, audio_path in enumerate(audio_paths):
    track_scores = pred_tags[track_no, :].tolist()
    ranking = sort_result(tags, track_scores)
    print(audio_path)
    print(ranking[:10])
    print(' ')