def get_harmony_mapping(all_harmonies, chord_cutoff=64):
    """
    Build the chord <-> index lookup tables for a set of harmonies.

    all_harmonies: all harmonies of the songs; dim: [num_songs * T],
        elements are chord strings;
    chord_cutoff: chords seen fewer than this many times are left out of
        the mapping (they are meant to be marked as rests in the resulting
        dataset);
    return: (chord_to_idx, idx_to_chord), two dicts that are inverses of
        each other. NO_CHORD always has an entry, even if it is rare or
        never occurs in the input.
    """
    # Local import so this block does not rely on the file's import header.
    from collections import Counter

    counts = Counter(h for harmony in all_harmonies for h in harmony)

    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    kept = {c: n for c, n in counts.items() if n >= chord_cutoff}
    if NO_CHORD not in kept:
        kept[NO_CHORD] = 1  # make sure NO_CHORD is in the map

    # Freeze the key order once so both tables are built from the same sequence.
    ordered = list(kept)
    chord_to_idx = {c: i for i, c in enumerate(ordered)}
    idx_to_chord = dict(enumerate(ordered))
    return chord_to_idx, idx_to_chord
def prepare_nottingham_pickle(time_step, chord_cutoff=64, filename=PICKLE_LOC, verbose=False):
    """
    Parse the Nottingham train/test/valid directories, encode every time
    step as a melody one-hot plus a chord one-hot, and pickle the result.

    time_step: the time step to discretize all notes into
    chord_cutoff: if chords are seen less than this cutoff, they are ignored
        and marked as rests (NO_CHORD) in the resulting dataset
    filename: the location where the pickle will be saved to
    verbose: if true, print the kept chord counts and sequence statistics

    return: True once the pickle has been written

    The pickled dict contains 'chord_to_idx' and, for each split d, the list
    of encoded sequences under d and the parsed metadata under
    d + '_metadata'.
    """
    splits = ["train", "test", "valid"]
    data = {}
    store = {}
    counts = {}
    max_seq = 0
    seq_lens = []

    for d in splits:
        print("Parsing {}...".format(d))
        # NOTE(review): verbose=False is hard-coded, so this function's
        # `verbose` flag is not forwarded to the parser -- confirm intended.
        parsed = parse_nottingham_directory(
            "data/Nottingham/{}".format(d), time_step, verbose=False)

        # Each parsed item is (metadata, (melody, harmony)).
        seqs = [s[1] for s in parsed]
        data[d] = seqs
        data[d + '_metadata'] = [s[0] for s in parsed]

        lens = [len(harmony) for _, harmony in seqs]
        seq_lens += lens
        if lens:  # an empty split must not crash max()
            max_seq = max(max_seq, max(lens))

        for _, harmony in seqs:
            for h in harmony:
                counts[h] = counts.get(h, 0) + 1

    avg_seq = float(sum(seq_lens)) / len(seq_lens) if seq_lens else 0.0

    # Keep the frequent chords. NO_CHORD is always kept because combine()
    # maps every dropped chord onto it; without this a rare NO_CHORD would
    # turn into a KeyError. .items() works on both Python 2 and 3.
    chords = {c: n for c, n in counts.items()
              if n >= chord_cutoff or c == NO_CHORD}
    chords.setdefault(NO_CHORD, 0)

    chord_mapping = {c: i for i, c in enumerate(chords)}
    num_chords = len(chord_mapping)
    store['chord_to_idx'] = chord_mapping

    if verbose:
        pprint(chords)
        print("Number of chords: {}".format(num_chords))
        print("Max Sequence length: {}".format(max_seq))
        print("Avg Sequence length: {}".format(avg_seq))
        print("Num Sequences: {}".format(len(seq_lens)))

    rest_idx = chord_mapping[NO_CHORD]

    def combine(melody, harmony):
        """
        Join one song's melody matrix and chord list into a single matrix
        with one row per time step: melody columns first, then one column
        per kept chord. `melody` is modified in place (empty steps get the
        empty-melody marker).
        """
        assert melody.shape[0] == len(harmony)
        full = np.zeros((melody.shape[0], NOTTINGHAM_MELODY_RANGE + num_chords))

        # for all melody steps that don't have any notes, add the empty
        # melody marker (last one)
        silent = (melody != 0).sum(axis=1) == 0
        melody[silent, NOTTINGHAM_MELODY_RANGE - 1] = 1

        # all melody encodings should now have exactly one 1
        assert ((melody != 0).sum(axis=1) == 1).all()

        # add all the melodies
        full[:, :melody.shape[1]] += melody

        # chords that were cut off fall back to the NO_CHORD column
        harmony_idxs = [NOTTINGHAM_MELODY_RANGE + chord_mapping.get(h, rest_idx)
                        for h in harmony]
        full[np.arange(len(harmony)), harmony_idxs] = 1

        # all full encodings should have exactly two 1's
        assert ((full != 0).sum(axis=1) == 2).all()
        return full

    for d in splits:
        print("Combining {}".format(d))
        store[d] = [combine(m, h) for m, h in data[d]]
        store[d + '_metadata'] = data[d + '_metadata']

    # protocol=-1 selects a binary pickle format, so the file has to be
    # opened in binary mode.
    with open(filename, 'wb') as f:
        cPickle.dump(store, f, protocol=-1)

    return True
# Tally one more occurrence of chord `c`.
# NOTE(review): `c`, `chords` and the loop this statement belongs to are
# defined above this fragment -- confirm the indentation against that loop.
chords[c] = chords.get(c, 0) + 1

# Calculate average length, which may be used for identifying batch timestep length
avg_seq = float(sum(sequenceLength)) / len(sequenceLength)

# Prepare chord index for harmony one hot vector
chordLimit = 64

# Keep only the chords seen at least chordLimit times (chord -> frequency).
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
chords = {
    chord: count for chord, count in chords.items() if count >= chordLimit
}
chordMap = {chord: ind for ind, chord in enumerate(chords)}
numChords = len(chordMap)

# append chord index to final
final['chordIx'] = chordMap

# plot the chord distribution chart
# (lists rather than dict views, so matplotlib gets real sequences on Python 3)
pprint(chords)
plt.figure(figsize=(10, 4))
plt.bar(range(len(chords)), list(chords.values()))
plt.xticks(range(len(chords)), list(chords.keys()))
plt.show()

# print sequence information
# (single-argument print(...) behaves the same on Python 2 and 3)
print("Total sequences parsed: {}".format(len(sequenceLength)))
print("Maximum length of sequences: {}".format(sequenceMax))
def prepare_nottingham_pickle(time_step, chord_cutoff=64, filename=PICKLE_LOC, verbose=False):
    """
    Parse the Nottingham train/test/valid directories, encode every time
    step as a melody one-hot plus a chord one-hot, and pickle the result.

    time_step: the time step to discretize all notes into
    chord_cutoff: if chords are seen less than this cutoff, they are ignored
        and marked as rests (NO_CHORD) in the resulting dataset
    filename: the location where the pickle will be saved to
    verbose: if true, print the kept chord counts and sequence statistics

    return: True once the pickle has been written

    The pickled dict contains 'chord_to_idx' and, for each split d, the list
    of encoded sequences under d and the parsed metadata under
    d + '_metadata'.
    """
    splits = ["train", "test", "valid"]
    data = {}
    store = {}
    counts = {}
    max_seq = 0
    seq_lens = []

    for d in splits:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Parsing {}...".format(d))
        # NOTE(review): verbose=False is hard-coded, so this function's
        # `verbose` flag is not forwarded to the parser -- confirm intended.
        parsed = parse_nottingham_directory(
            "data/Nottingham/{}".format(d), time_step, verbose=False)

        # Each parsed item is (metadata, (melody, harmony)).
        seqs = [s[1] for s in parsed]
        data[d] = seqs
        data[d + '_metadata'] = [s[0] for s in parsed]

        lens = [len(harmony) for _, harmony in seqs]
        seq_lens += lens
        if lens:  # an empty split must not crash max()
            max_seq = max(max_seq, max(lens))

        for _, harmony in seqs:
            for h in harmony:
                counts[h] = counts.get(h, 0) + 1

    avg_seq = float(sum(seq_lens)) / len(seq_lens) if seq_lens else 0.0

    # Keep the frequent chords. NO_CHORD is always kept because combine()
    # maps every dropped chord onto it; without this a rare NO_CHORD would
    # turn into a KeyError. .items() works on both Python 2 and 3.
    chords = {c: n for c, n in counts.items()
              if n >= chord_cutoff or c == NO_CHORD}
    chords.setdefault(NO_CHORD, 0)

    chord_mapping = {c: i for i, c in enumerate(chords)}
    num_chords = len(chord_mapping)
    store['chord_to_idx'] = chord_mapping

    if verbose:
        pprint(chords)
        print("Number of chords: {}".format(num_chords))
        print("Max Sequence length: {}".format(max_seq))
        print("Avg Sequence length: {}".format(avg_seq))
        print("Num Sequences: {}".format(len(seq_lens)))

    rest_idx = chord_mapping[NO_CHORD]

    def combine(melody, harmony):
        """
        Join one song's melody matrix and chord list into a single matrix
        with one row per time step: melody columns first, then one column
        per kept chord. `melody` is modified in place (empty steps get the
        empty-melody marker).
        """
        assert melody.shape[0] == len(harmony)
        full = np.zeros((melody.shape[0], NOTTINGHAM_MELODY_RANGE + num_chords))

        # for all melody steps that don't have any notes, add the empty
        # melody marker (last one)
        silent = (melody != 0).sum(axis=1) == 0
        melody[silent, NOTTINGHAM_MELODY_RANGE - 1] = 1

        # all melody encodings should now have exactly one 1
        assert ((melody != 0).sum(axis=1) == 1).all()

        # add all the melodies
        full[:, :melody.shape[1]] += melody

        # chords that were cut off fall back to the NO_CHORD column
        harmony_idxs = [NOTTINGHAM_MELODY_RANGE + chord_mapping.get(h, rest_idx)
                        for h in harmony]
        full[np.arange(len(harmony)), harmony_idxs] = 1

        # all full encodings should have exactly two 1's
        assert ((full != 0).sum(axis=1) == 2).all()
        return full

    for d in splits:
        print("Combining {}".format(d))
        store[d] = [combine(m, h) for m, h in data[d]]
        store[d + '_metadata'] = data[d + '_metadata']

    # protocol=-1 selects a binary pickle format, so the file has to be
    # opened in binary mode.
    with open(filename, 'wb') as f:
        cPickle.dump(store, f, protocol=-1)

    return True
# Disabled padding code, kept as found (its beginning lies above this fragment):
# if rem > 0:
#     padStep = []
#     for _ in range(rem):
#         padStep += [[0,0,0,0]]
#     tStepFinal += [padStep+ tStepVec]
# else:
#     tStepFinal += ([tStepVec])
# data[d + '_countFB'] = tStepFinal

# Calculate average length, which may be used for identifying batch timestep length
avg_seq = float(sum(seq_lens)) / len(seq_lens)

# Prepare chord index for harmony one hot vector:
# keep only the chords seen at least chordLimit times (chord -> frequency).
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
chords = {c: n for c, n in chords.items() if n >= chordLimit}
chord_mapping = {c: i for i, c in enumerate(chords)}
num_chords = len(chord_mapping)
store['chord_to_idx'] = chord_mapping

# plot the chord distribution chart
# (lists rather than dict views, so matplotlib gets real sequences on Python 3)
pprint(chords)
plt.figure(figsize=(10, 4))
plt.bar(range(len(chords)), list(chords.values()))
plt.xticks(range(len(chords)), list(chords.keys()))
plt.show()

# print sequence information
# (single-argument print(...) behaves the same on Python 2 and 3)
print("Number of chords: {}".format(num_chords))
print("Max Sequence length: {}".format(max_seq))
print("Avg Sequence length: {}".format(avg_seq))
print("Num Sequences: {}".format(len(seq_lens)))