Example #1
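All snippets below assume a common set of imports; helpers such as read_training_data, ensure_dir, get_threshold, and parse_xml are defined elsewhere in the repository. A minimal sketch of the shared imports (later snippets additionally use torch, shutil, importlib, and datetime):

import csv
import os
import pickle
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm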
def plot_histogram_per_iteration(data_file,
                                 feature='grade',
                                 plt_dir='plots/augmented-generation/',
                                 threshold=None):
    """
    visualize model updates by plotting histogram for grade distribution at each iteration
    """
    # read update data as dictionary
    data_dict = read_training_data(data_file=data_file,
                                   feature=feature,
                                   threshold=threshold)

    plt.style.use('seaborn-whitegrid')
    plt.figure(figsize=(20, 10))
    for it, data in data_dict.items():
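        # subplot indices are 1-based, so iteration keys are assumed to run from 1 through 10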
        plt.subplot(2, 5, it)
        plt.hist(data, alpha=0.7)
        plt.xlabel(feature)
        plt.title(f'Iteration {it}')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.suptitle(
        f'{feature} Distribution of Generations at Each Iteration of Training',
        fontsize=20)
    ensure_dir(plt_dir)
    plt.savefig(os.path.join(plt_dir, f'{feature}_update_dist.png'))
    plt.close()
Example #2
def plot_violinplots(data_dict, plt_title, plt_dir):
    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots(figsize=(7.2, 4.4))
    ax.grid(False)
    r = ax.violinplot(list(data_dict.values()),
                      showmeans=True,
                      showmedians=True)
    r['cmedians'].set_label('Median grade')
    r['cmedians'].set_color('rebeccapurple')
    r['cmeans'].set_label('Mean grade')
    r['cmeans'].set_color('steelblue')
    ax.get_xaxis().set_tick_params(direction='out')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xticks(np.arange(1, len(data_dict.keys()) + 1))
    ax.set_xticklabels(data_dict.keys(), fontsize=11)
    ax.set_ylabel('Grade', fontsize=11)
    plt.title(plt_title)
    plt.text(0.2, 1, 'better')
    plt.text(0.2, 46, 'worse')
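    # the median and sigma annotations below are hardcoded, presumably from a prior run on this data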
    plt.text(0.75, 17, 'Median: ' + r'$4.91$' + '\n    ' + r'$\sigma: 1.63$')
    plt.text(1.75, 27, 'Median: ' + r'$8.02$' + '\n    ' + r'$\sigma: 2.92$')
    plt.text(2.75, 42, 'Median: ' + r'$10.62$' + '\n    ' + r'$\sigma: 5.57$')
    plt.text(3.75, 12, 'Median: ' + r'$24.10$' + '\n    ' + r'$\sigma: 7.96$')
    plt.legend(loc='upper left')
    ensure_dir(plt_dir)
    plt.tight_layout()
    plt.savefig(os.path.join(plt_dir, 'grade_violinplot.png'))
    plt.close()
def plot_repeated_sequence_histogram(repeated_sequence_counter, plt_dir, plt_name):
    """
    Arguments
        repeated_sequence_histogram: sequence_table representing the chorale
        plt_dir: output directory
        plt_name: file name
    
    plot the distribution/histogram for a chorale (can be a "mean" chorale)
    """
    plt.figure()
    ensure_dir(plt_dir)
    plt.bar(repeated_sequence_counter.keys(), repeated_sequence_counter.values())
    if len(list(repeated_sequence_counter.keys())) == 0:
        max_seq = 0
    else:
        max_seq = np.max(list(repeated_sequence_counter.keys()))
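    # sequence lengths are assumed to be in sixteenth-note steps; label the ticks in beats (4 steps per beat)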
    xticks = range(1,(max_seq+4)//4,1)
    plt.xticks(ticks=[tick*4 for tick in xticks], labels=xticks)
    plt.xlim([0, max_seq+1])
    plt.xlabel('Sequence length (beats)')
    if abs(np.sum(list(repeated_sequence_counter.values())) - 1) < 1e-8:
        plt.ylabel('Proportion of repeated sequences')
    else:
        plt.ylabel('Count of repeated sequences')
    plt.savefig(f'{plt_dir}/{plt_name}.png')
    plt.close()
Example #4
def plot_boxplot_per_epoch(
        data_file='results/update_grades_over_bach_chorales.csv',
        feature='grade',
        plt_dir='plots/augmented-generation/',
        threshold=None):
    """
    Arguments
        data_file: file containing update grades
        feature: feature of interest (either overall grade or a feature distance)
        plt_dir: directory to save plots
        threshold: lower threshold for inclusion

    visualize model updates by plotting boxplot for grade distribution at each epoch
    """
    # read update data as dictionary
    data_dict = read_training_data(data_file=data_file,
                                   feature=feature,
                                   threshold=threshold)

    # plot
    plt.style.use('seaborn-whitegrid')
    plt.rc('xtick', labelsize=11)
    plt.rc('ytick', labelsize=11)
    plt.rc('axes', titlesize=13)
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.xaxis.grid(False)
    ax.boxplot(list(data_dict.values()))
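    # boxplot positions are 1-based, while epoch keys are assumed to start at 0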
    ax.set_xticks([i + 1 for i in data_dict.keys()])
    ax.set_xticklabels([str(i) for i in data_dict.keys()])
    for label in ax.get_xaxis().get_ticklabels()[1::2]:
        label.set_visible(False)
    ylabel0 = ax.get_yaxis().get_ticklabels()[0]
    ylabel0.set_visible(False)
    plt.xlabel('Epoch')
    plt.title(
        f'{feature.capitalize()} Distribution of Generations During Aug-Gen Training'
    )
    plt.text(-2.2, 1, 'better')
    plt.text(-2.2, 47, 'worse')
    plt.ylabel(feature.capitalize())
    plt.ylim([0, 49.15])

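    # overlay the third quartile (75th percentile) of Bach grades as a reference line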
    bach_q3 = get_threshold(
        data_file='experiments/ablations/reg_pe_no_oe/bach_grades.csv',
        column='grade',
        aggregate='75p',
    )
    plt.axhline(y=bach_q3,
                color='steelblue',
                linestyle='-.',
                label=r'$Q_3$' + ' of Bach grades')
    plt.legend(loc='upper right')

    ensure_dir(plt_dir)
    fig.tight_layout()
    plt.savefig(os.path.join(plt_dir, f'{feature}_update_boxplots.png'))
    plt.close()
Example #5
    def load_or_pickle_distributions(self):
        voice_ranges_file = 'voice_ranges.txt'
        distributions_file = 'bach_distributions.txt'
        error_note_ratio_file = 'error_note_ratio.txt'
        parallel_error_note_ratio_file = 'parallel_error_note_ratio.txt'
        dist_files = [
            distributions_file, error_note_ratio_file,
            parallel_error_note_ratio_file
        ]
        gaussian_file = 'gaussian.txt'
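        # note: these cache files are pickled binaries despite the .txt extension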

        # create pickle_dir if it does not exist
        if not os.path.exists(self.pickle_dir):
            ensure_dir(self.pickle_dir)
            with open(f'{self.pickle_dir}/features.txt', 'w') as readme:
                readme.write('Features:\n')
                readme.write('\n'.join(self.features))
        # compute or load voice ranges
        if os.path.exists(f'{self.pickle_dir}/{voice_ranges_file}'):
            with open(f'{self.pickle_dir}/{voice_ranges_file}', 'rb') as fin:
                self.voice_ranges = pickle.load(fin)
        else:
            self.compute_voice_ranges(self.iterator, 4)
            with open(f'{self.pickle_dir}/{voice_ranges_file}', 'wb') as fo:
                pickle.dump(self.voice_ranges, fo)
        # compute or load distributions
        if np.all(
            [os.path.exists(f'{self.pickle_dir}/{f}') for f in dist_files]):
            with open(f'{self.pickle_dir}/{distributions_file}', 'rb') as fin:
                self.distributions = pickle.load(fin)
            with open(f'{self.pickle_dir}/{error_note_ratio_file}',
                      'rb') as fin:
                self.error_note_ratio = pickle.load(fin)
            with open(f'{self.pickle_dir}/{parallel_error_note_ratio_file}',
                      'rb') as fin:
                self.parallel_error_note_ratio = pickle.load(fin)
        else:
            self.calculate_distributions()
            with open(f'{self.pickle_dir}/{distributions_file}', 'wb') as fo:
                pickle.dump(self.distributions, fo)
            with open(f'{self.pickle_dir}/{error_note_ratio_file}',
                      'wb') as fo:
                pickle.dump(self.error_note_ratio, fo)
            with open(f'{self.pickle_dir}/{parallel_error_note_ratio_file}',
                      'wb') as fo:
                pickle.dump(self.parallel_error_note_ratio, fo)
        # compute or load Gaussian
        if os.path.exists(f'{self.pickle_dir}/{gaussian_file}'):
            with open(f'{self.pickle_dir}/{gaussian_file}', 'rb') as fin:
                self.gaussian = pickle.load(fin)
        else:
            self.fit_gaussian()
            with open(f'{self.pickle_dir}/{gaussian_file}', 'wb') as fo:
                pickle.dump(self.gaussian, fo)
Example #6
def plot_boxplots(data_dict, plt_title, plt_dir):
    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots()
    ax.xaxis.grid(False)
    ax.boxplot(list(data_dict.values()))
    ax.set_xticklabels(data_dict.keys())
    plt.ylabel('Grade')
    plt.title(plt_title)
    ensure_dir(plt_dir)
    plt.savefig(os.path.join(plt_dir, 'grade_boxplot.png'))
    plt.close()
def grade_unconstrained_mock(grader,
                             transformer,
                             output_dir=None,
                             num_generations=1):
    """
    Arguments:
        grader: Grader object
        transformer: model for generation
        grades_csv: csv file to write grades to
        num_generations: number of generations
    
    Usage example:
        grade_unconstrained_mock(grader=grader,
                                 transformer=transformer,
                                 grades_csv='results/unconstrained_mock_grades.csv',
                                 num_generations=351)
    """
    print('Generating and grading unconstrained mock chorales')
    mock_grades = []

    # calculate batch sizes
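    # max_batch_size is assumed to be a module-level constant capping the generation batch size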
    batch_sizes = [max_batch_size] * (num_generations // max_batch_size)
    if num_generations % max_batch_size != 0:
        batch_sizes += [num_generations % max_batch_size]

    mock_scores = []
    for i in tqdm(range(len(batch_sizes))):
        score_batch = transformer.generate(temperature=0.9,
                                           top_p=0.8,
                                           batch_size=batch_sizes[i])
        mock_scores.extend(score_batch)

    if output_dir is None:
        output_dir = f'{transformer.model_dir}/unconstrained_mocks/'
    ensure_dir(output_dir)

    for i, score in enumerate(mock_scores):
        # write score to XML
        score.write('xml', f'{output_dir}/{i}.xml')

        # grade chorale
        grade, chorale_vector = grader.grade_chorale(score)
        mock_grades.append([grade, *chorale_vector])

    print('Writing data to csv file')
    with open(f'{output_dir}/grades.csv', 'w') as grades_file:
        writer = csv.writer(grades_file)
        writer.writerow(['', 'grade'] + FEATURES)
        for i, grades in enumerate(mock_grades):
            writer.writerow([i, *grades])
def grade_constrained_mock(
    grader,
    transformer,
    bach_iterator=None,
    output_dir=None,
    num_generations=1,
):
    """
    Arguments:
        grader: Grader object
        transformer: model for generation
        bach_iterator: iterator containing Bach chorales
    """
    print('Generating and grading constrained mock chorales')
    mock_grades = []
    if output_dir is None:
        output_dir = f'{transformer.model_dir}/constrained_mocks/'
    ensure_dir(output_dir)

    for i, bach_score in tqdm(enumerate(bach_iterator)):
        bach_melody = score_to_hold_representation_for_voice(bach_score,
                                                             voice=0)
        try:
            mock_score = transformer.generate(temperature=0.9,
                                              top_p=0.8,
                                              batch_size=1,
                                              melody_constraint=bach_melody,
                                              hard_constraint=True)[0]
        # transformer.generate can raise IndexError on some inputs
        # (e.g. "index 96 is out of bounds for dimension 1 with size 96")
        except IndexError:
            print(f'chorale {i} is problematic')
            mock_grades.append([float('-inf')])
            continue

        # write mock_score to XML
        mock_score.write('xml', f'{output_dir}/{i}.xml')

        # grade chorale
        grade, chorale_vector = grader.grade_chorale(mock_score)
        mock_grades.append([grade, *chorale_vector])

        if i + 1 >= num_generations:
            break

    print('Writing data to csv file')
    with open(f'{output_dir}/grades.csv', 'w') as chorale_file:
        writer = csv.writer(chorale_file)
        writer.writerow(['', 'grade'] + FEATURES)
        for i, grades in enumerate(mock_grades):
            writer.writerow([i, *grades])
Example #9
def plot_histograms(data_dict,
                    plt_title,
                    plt_dir,
                    plt_name=None,
                    threshold=None):
    """
    Arguments
        data_dict: a dictionary of data with key as label and value as list of grades/distances
            {'Bach chorales': [10, 12, ...], 'Generations': [20, 15, ...]}
        feature: feature of interest (either overall grade or a feature distance)
        plt_title: title of plot
        plt_dir: directory to save plots
        threshold: lower threshold for inclusion

    compare grade distributions as boxplot and as histogram
    """
    # remove grades of -inf and, if a threshold is given, drop grades above it
    for label in data_dict:
        data_dict[label] = [x for x in data_dict[label] if x != float('-inf')]
        if threshold is not None:
            data_dict[label] = [x for x in data_dict[label] if x < threshold]

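    # compute shared bin edges from the pooled data so the histograms are directly comparable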
    bins = np.histogram(list(chain.from_iterable(data_dict.values())),
                        bins=100)[1]

    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.xaxis.grid(False)
    for label, data in data_dict.items():
        plt.hist(data, label=label, alpha=0.5, bins=bins)
    plt.xlabel('Grade')
    plt.ylabel('Frequency')
    plt.title(plt_title)
    plt.legend()
    ensure_dir(plt_dir)
    if plt_name is None:
        plt_name = 'grade_dist'
    fig.tight_layout()
    plt.savefig(os.path.join(plt_dir, f'{plt_name}.png'))
    plt.close()
Example #10
def plot_learning_curves(gen_folder):
    with open(f'{gen_folder}/loss.csv', 'r') as fin:
        df = pd.read_csv(fin)
        train_loss = df['train_loss']
        val_loss = df['val_loss']

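    # report the epoch with the lowest validation loss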
    print(df[df['val_loss'] == df['val_loss'].min()])

    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots()
    ax.grid(False)
    plt.plot(train_loss)
    plt.plot(val_loss)
    plt.legend(['Training loss', 'Validation loss'])
    plt.title('Training curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    ensure_dir(f'{gen_folder}/plots')
    plt.savefig(f'{gen_folder}/plots/training_curves.png')
    plt.close()


ALL_ABLATIONS = {
    # 'note_self_similarity': note_self_similarity,
    # 'original_seq1': FEATURES + ['repeated_sequence'],
    # 'sequence_1': ['sequence_1'],
    # 'sequence_2': ['repeated_sequence_2'],
    # 'self_similarity': ['self_similarity'],
    'reg_pe_no_oe': FEATURES
}


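# ABLATIONS_DIR, BACH_GRADES_CSV, and MOCK_GRADES_CSV are assumed to be path constants defined elsewhere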
for ablation in ALL_ABLATIONS:
    print(f'----- current ablation: {ablation} -----')
    features = ALL_ABLATIONS[ablation]
    weights = [1] * len(features)
    
    PICKLE_DIR = ablation
    ensure_dir(f'{ABLATIONS_DIR}/{PICKLE_DIR}/')

    grader = Grader(
        features=features,
        iterator=None,
    )
    
    bach_df = pd.read_csv(f'{ABLATIONS_DIR}/{PICKLE_DIR}/{BACH_GRADES_CSV}')
    mock_df = pd.read_csv(f'{ABLATIONS_DIR}/{PICKLE_DIR}/{MOCK_GRADES_CSV}').dropna()
    # bach_vectors = bach_df[features].values.tolist()
    # mock_vectors = mock_df[features].values.tolist()

    # bach_grades = []
    # for bach_vector in bach_vectors:
    #     bach_grades.append(np.dot(weights, bach_vector))
def main(
    train,
    load,
    aug_gen,
    base,
    generate,
    overfitted,
    epoch,
    config,
    description,
    num_workers,
):
    # use all available GPUs
    gpu_ids = list(range(torch.cuda.device_count()))
    print(f'Using GPUs {gpu_ids}')

    # Load config
    config_path = config
    config_module_name = os.path.splitext(config)[0].replace('/', '.')
    config = importlib.import_module(config_module_name).config

    from experiments.augmentative_generation import augmentative_generation
    from experiments.generate_and_grade import grade_folder, grade_constrained_mock, grade_unconstrained_mock
    from Grader.grader import Grader, FEATURES
    from Grader.helpers import get_threshold

    # set random seed
    seed(config['random_seed'])

    # compute time stamp
    if config['timestamp'] is not None:
        timestamp = config['timestamp']
    else:
        timestamp = datetime.now().strftime('%m-%d_%H:%M')
        config['timestamp'] = timestamp

    # set model_dir
    if load:
        model_dir = os.path.dirname(config_path)
    else:
        if config['savename'] is None:
            if aug_gen:
                config['savename'] = 'aug-gen'
            elif base:
                config['savename'] = 'base'
            else:
                config['savename'] = 'model'
        model_dir = f'models/{config["savename"]}_{timestamp}'

    # === Decoder ====
    print('Parsing XML Bach dataset')
    bach_dataset = [
        parse_xml(f'chorales/bach_chorales/{i}.xml') for i in tqdm(range(351))
    ]
    num_examples = len(bach_dataset)
    split = [0.8, 0.2]
    train_dataset = bach_dataset[:int(split[0] * num_examples)]
    val_dataset = bach_dataset[int(split[0] * num_examples):]
    dataloader_generator_kwargs = config['dataloader_generator_kwargs']

    train_dataloader_generator = SmallBachDataloaderGenerator(
        dataset_name='bach_train',
        chorales=train_dataset,
        include_transpositions=dataloader_generator_kwargs[
            'include_transpositions'],
        sequences_size=dataloader_generator_kwargs['sequences_size'],
    )

    val_dataloader_generator = SmallBachDataloaderGenerator(
        dataset_name='bach_val',
        chorales=val_dataset,
        include_transpositions=dataloader_generator_kwargs[
            'include_transpositions'],
        sequences_size=dataloader_generator_kwargs['sequences_size'],
    )

    data_processor = get_data_processor(
        dataloader_generator=train_dataloader_generator,
        data_processor_type=config['data_processor_type'],
        data_processor_kwargs=config['data_processor_kwargs'])

    decoder_kwargs = config['decoder_kwargs']
    num_channels = 4  # number of voices (SATB)
    num_events_grouped = 4
    num_events = dataloader_generator_kwargs['sequences_size'] * 4
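    # the factor of 4 presumably gives 4 events (sixteenth-note subdivisions) per beat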
    transformer = TransformerBach(
        model_dir=model_dir,
        train_dataloader_generator=train_dataloader_generator,
        val_dataloader_generator=val_dataloader_generator,
        data_processor=data_processor,
        d_model=decoder_kwargs['d_model'],
        num_encoder_layers=decoder_kwargs['num_encoder_layers'],
        num_decoder_layers=decoder_kwargs['num_decoder_layers'],
        n_head=decoder_kwargs['n_head'],
        dim_feedforward=decoder_kwargs['dim_feedforward'],
        dropout=decoder_kwargs['dropout'],
        positional_embedding_size=decoder_kwargs['positional_embedding_size'],
        num_channels=num_channels,
        num_events=num_events,
        num_events_grouped=num_events_grouped)

    if load:
        if overfitted:
            transformer.load(early_stopped=False)
        elif epoch:
            transformer.load(epoch=epoch)
        else:
            transformer.load(early_stopped=True)
        transformer.to('cuda')

    # copy .py config file and create README in the model directory before training
    if not load:
        ensure_dir(model_dir)
        shutil.copy(config_path, f'{model_dir}/config.py')
        transformer.to('cuda')

        with open(f'{model_dir}/README.txt', 'w') as readme:
            readme.write(description)

    grader = Grader(
        features=FEATURES,
        iterator=bach_dataset,
    )

    if train:
        transformer.train_model(
            batch_size=config['batch_size'],
            num_batches=config['num_batches'],
            num_epochs=config['num_epochs'],
            lr=config['lr'],
            plot=True,
            num_workers=num_workers,
        )

    if aug_gen:
        threshold = get_threshold(
            data_file='experiments/ablations/reg_pe_no_oe/bach_grades.csv',
            column='grade',
            aggregate='75p',
        )
        augmentative_generation(
            transformer=transformer,
            grader=grader,
            config=config,
            num_workers=num_workers,
            bach_iterator=train_dataset,
            threshold=threshold,
        )

    if base:
        # base model
        augmentative_generation(transformer=transformer,
                                grader=grader,
                                config=config,
                                num_workers=num_workers,
                                bach_iterator=train_dataset,
                                threshold=float('inf'))

    if generate:
        grade_constrained_mock(
            grader=grader,
            transformer=transformer,
            output_dir=f'{transformer.model_dir}/constrained_mocks',
            bach_iterator=bach_dataset,
            num_generations=1,
        )
"""
Create directory of 351 4-part Bach chorales in chorales/bach_chorales/ in XML from music21
File name's index number may not correspond to Riemenschneider index number for chorale.
"""

import sys
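# point sys.path[0] (the script's directory) at its parent, presumably the repository root,
# so that transformer_bach imports resolve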
sys.path[0] += '/../'

import music21
from tqdm import tqdm
from transformer_bach.utils import ensure_dir

i = 0
ensure_dir('chorales')
ensure_dir('chorales/bach_chorales')
for chorale in tqdm(music21.corpus.chorales.Iterator(1, 371)):
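    # keep only four-part (SATB) chorales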
    if len(chorale.parts) == 4:
        for n in chorale.recurse().getElementsByClass('Note'):
            n.lyric = None  # remove lyrics
            n.expressions = []  # remove fermatas
        chorale.write('xml', f'chorales/bach_chorales/{i}.xml')
        i += 1