def segment_many(index,
                 audio_files,
                 mode,
                 output_directory,
                 num_cpus=-1,
                 verbose=0):
    """Segment a collection of audio files in parallel.

    Parameters
    ----------
    index : array_like
        Index values for the audio files; must be unique!

    audio_files : array_like
        Collection of audio filepaths, parallel to `index`.

    mode : str
        Segmentation mode to use; see minst.signal.ONSETS for more details.

    output_directory : str
        Path at which outputs will be written.

    num_cpus : int, default=-1
        Number of CPUs to use for parallel downloads; -1 for all.

    verbose : int, default=0
        Verbosity level for parallel computation.

    Returns
    -------
    output_paths : list
        Filepaths of generated output, or None for failures.

    Raises
    ------
    ValueError
        If `index` contains duplicate values.
    """
    if len(set(index)) < len(index):
        raise ValueError("All index values must be unique.")

    logger.info("beginning segmenting {} files with mode={}"
                "".format(len(index), mode))
    utils.create_directory(output_directory)
    # Fan one segment_one job per (index, audio_file) pair out to a
    # joblib worker pool; results come back in input order.
    pool = Parallel(n_jobs=num_cpus, verbose=verbose)
    fx = delayed(segment_one)
    return pool(
        fx(idx, afile, mode, output_directory)
        for idx, afile in zip(index, audio_files))
def segment_one(index, audio_file, mode, output_directory):
    """Segment a single audio file and write the result to CSV.

    Parameters
    ----------
    index : str
        Index value corresponding to this audio file.

    audio_file : str
        Path to an audio file on disk.

    mode : str
        Segmentation mode to use; see minst.signal.ONSETS for more details.

    output_directory : str
        Path at which to write outputs.

    Returns
    -------
    output_file : str
        Path at which data was written.

    Raises
    ------
    ValueError
        If the output file was not created on disk.
    """
    # Time the segmentation step for the summary log line below.
    seg_start = time.time()
    segments = S.segment(audio_file, mode)
    seg_elapsed = time.time() - seg_start

    # Time the CSV write separately from the segmentation itself.
    write_start = time.time()
    output_file = os.path.join(output_directory, "{}.csv".format(index))
    segments.to_csv(output_file)
    write_elapsed = time.time() - write_start

    # Sanity-check that the write actually landed on disk.
    if not os.path.exists(output_file):
        raise ValueError("Did not create output! {}".format(output_file))

    logger.info("Segmentation complete \n\taudio: {} \n\t-> segments: {} \n"
                "\t[seg_time: {}] [write_time: {}]".format(
                    audio_file, output_file, seg_elapsed, write_elapsed))

    return output_file
def segment_one(index, audio_file, mode, output_directory):
    """Run onset segmentation for one audio file, saving results as CSV.

    Parameters
    ----------
    index : str
        Index value corresponding to this audio file.

    audio_file : str
        Path to an audio file on disk.

    mode : str
        Segmentation mode to use; see minst.signal.ONSETS for more details.

    output_directory : str
        Path at which to write outputs.

    Returns
    -------
    output_file : str
        Path at which data was written.

    Raises
    ------
    ValueError
        If the expected output file does not exist after writing.
    """
    timer = time.time()
    onset_frame = S.segment(audio_file, mode)
    segment_duration = time.time() - timer

    # Reuse the timer for the write phase.
    timer = time.time()
    destination = os.path.join(output_directory, "{}.csv".format(index))
    onset_frame.to_csv(destination)
    write_duration = time.time() - timer
    # Verify the CSV really exists before reporting success.
    if not os.path.exists(destination):
        raise ValueError("Did not create output! {}".format(destination))

    logger.info("Segmentation complete \n\taudio: {} \n\t-> segments: {} \n"
                "\t[seg_time: {}] [write_time: {}]".format(
                    audio_file, destination, segment_duration,
                    write_duration))

    return destination
def segment_many(index, audio_files, mode, output_directory,
                 num_cpus=-1, verbose=0):
    """Segment a collection of audio files.

    Parameters
    ----------
    index : array_like
        Index values for the audio files; must be unique!

    audio_files : array_like
        Collection of audio filepaths.

    mode : str
        Segmentation mode to use; see minst.signal.ONSETS for more details.

    output_directory : str
        Path at which outputs will be written.

    num_cpus : int, default=-1
        Number of CPUs to use for parallel downloads; -1 for all.

    verbose : int, default=0
        Verbosity level for parallel computation.

    Returns
    -------
    output_paths : list
        Filepaths of generated output, or None for failures.

    Raises
    ------
    ValueError
        If `index` contains duplicate values.
    """
    if len(set(index)) < len(index):
        raise ValueError("All index values must be unique.")

    logger.info("beginning segmenting {} files with mode={}"
                "".format(len(index), mode))
    utils.create_directory(output_directory)
    # Dispatch one segment_one job per (index, audio_file) pair to a
    # joblib worker pool.
    pool = Parallel(n_jobs=num_cpus, verbose=verbose)
    fx = delayed(segment_one)
    return pool(fx(idx, afile, mode, output_directory)
                for idx, afile in zip(index, audio_files))
Example 5
def annotate_one(row, data_dir, skip_existing=False):
    """Open an annotation canvas for a single audio file.

    If a previous onset file exists, it is loaded first.

    Parameters
    ----------
    row : pd.Series
        Record for one audio file; reads `audio_file`, `onsets_file`,
        `instrument`, and the index value `name`.

    data_dir : str
        Path to where to put index files if none given.

    skip_existing : bool
        If edited files exist, skip over them.

    Returns
    -------
    quit, mark_for_later : bool, bool
        Flags reported by the canvas after its window closes; both
        False when an existing edit was skipped.
    """
    logger.info("Annotating:\n{} [idx={}]".format(row.audio_file,
                                                  row.name))

    # Safely try to load the onset_file; fall back to an empty frame
    # when no usable onsets file is recorded for this row.
    have_onsets = (not pd.isnull(row.onsets_file) and
                   os.path.exists(row.onsets_file))
    if have_onsets:
        onset_frame = pd.read_csv(row.onsets_file)
        output_file = row.onsets_file.replace(".csv", "-fix.csv")
    else:
        onset_frame = pd.DataFrame([])
        output_file = os.path.join(data_dir, "{}-fix.csv".format(row.name))
        logger.debug("No pre-existing onsets file exists for {}".format(
            row.audio_file))

    title = "{}| instrument: {}".format(row.name, row['instrument'])
    logger.info("Title: {}".format(title))

    # Guard clause: honor previously-edited output when requested.
    if os.path.exists(output_file) and skip_existing:
        return False, False

    start_time = time.time()
    canvas = OnsetCanvas(row.audio_file, output_file, onset_frame,
                         title=title)
    logger.info("Writing to: {}".format(output_file))
    # Block here until the annotation window is closed.
    plt.show(block=True)

    # Should we continue?
    elapsed = time.time() - start_time
    logger.info("Took {}s to work on {}".format(elapsed, row.audio_file))
    return canvas.quit, canvas._mark_for_later
Example 6
    def on_key_press(self, event):
        """Handle key-press events on the annotation canvas.

        Catches the following:
            x : Write current onset data and close this canvas.
            w : Write current onset data.
            q : Close this canvas without saving.
            Q : Quit the application (sets the quit flag, then closes).
            m : Mark this file for later review and close.
            spacebar : Toggle a marker at the current mouse position.
            c : Clear all current onsets
            d / D : Delete onsets near the cursor (1s / 5s threshold).
            1-4 : Re-seed onsets via envelope_onsets (increasing wait).
            6-7 : Re-seed onsets via logcqt_onsets.
            0 : Re-seed onsets via hll_onsets.
            left/right : Shift all onsets by -/+ 10ms.
            up/down : Shift all onsets by -/+ 100ms.

        Parameters
        ----------
        event : matplotlib KeyEvent
            Carries the pressed key and the cursor's data coordinates.
        """
        logger.debug('Received: {}'.format(event.key))
        sys.stdout.flush()
        if event.key == 'Q':
            # Hard-quit: flag the whole annotation session to stop.
            logger.info("Quitting")
            plt.close()
            self._alive = False
            self._quit = True

        elif event.key == 'x':
            # Save and close this canvas.
            logger.info("Saving to: {}".format(self.output_file))
            self.save_onsets()
            plt.close()
            self._alive = False

        elif event.key == 'w':
            # Save without closing.
            logger.info("Saving to: {}".format(self.output_file))
            self.save_onsets()

        elif event.key == 'q':
            # Close without saving.
            logger.info("Closing")
            plt.close()
            self._alive = False

        elif event.key == 'c':
            logger.info("Clearing existing markers")
            self.clear_onsets()

        elif event.key == 'm':
            # Flag this file to revisit later, then close.
            logger.info("Marking for later")
            self._mark_for_later = True
            plt.close()
            self._alive = False

        elif event.key == ' ':
            # Toggle an onset at the cursor's time position.
            x, y = event.xdata, event.ydata
            logger.debug('({:4}, {:4})'.format(x, y))
            self.modify_onsets_at_time(x)

        elif event.key == 'd':
            # Delete in a larger range (1s)
            x, y = event.xdata, event.ydata
            logger.info('({:4}, {:4})'.format(x, y))
            self.modify_onsets_at_time(x, threshold=1.0, create=False)

        elif event.key == 'D':
            # Delete in an even larger range (5s)
            x, y = event.xdata, event.ydata
            logger.info('({:4}, {:4})'.format(x, y))
            self.modify_onsets_at_time(x, threshold=5.0, create=False)

        # Use onset detectors to get started
        elif event.key == '1':
            logger.debug("Getting envelope_onsets(.008)")
            onsets = S.envelope_onsets(self.x, self.fs,
                                       wait=int(self.fs * .008))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == '2':
            # Reset onsets with "envelope_onsets"
            logger.debug("Getting envelope_onsets(.01)")
            onsets = S.envelope_onsets(self.x, self.fs,
                                       wait=int(self.fs * .01))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == '3':
            # Reset onsets with "envelope_onsets"
            logger.debug("Getting envelope_onsets(.02)")
            onsets = S.envelope_onsets(self.x, self.fs,
                                       wait=int(self.fs * .02))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == '4':
            # Reset onsets with "envelope_onsets"
            logger.debug("Getting envelope_onsets(.05)")
            onsets = S.envelope_onsets(self.x, self.fs,
                                       wait=int(self.fs * .05))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        # NOTE(review): key '5' is unbound -- confirm the gap is intentional.
        elif event.key == '6':
            # Reset onsets with "logcqt_onsets"
            logger.debug("Getting logcqt_onsets()")
            onsets = S.logcqt_onsets(self.x, self.fs,
                                     wait=int(self.fs * .01))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == '7':
            # Reset onsets with "logcqt_onsets"
            logger.debug("Getting logcqt_onsets()")
            onsets = S.logcqt_onsets(self.x, self.fs,
                                     wait=int(self.fs * .02))
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == '0':
            # Reset onsets with "hll_onsets" (works from the audio file
            # path rather than the loaded samples).
            logger.debug("Getting hll_onsets()")
            onsets = S.hll_onsets(self.audio_file)
            self.set_onset_data(pd.DataFrame(dict(time=onsets)))

        elif event.key == 'left':
            # Shift all markers to the left (subtract) by 10ms/.01s
            self.shift_onsets(-.01)

        elif event.key == 'right':
            # Shift all markers to the right (add) by 10ms/.01s
            self.shift_onsets(.01)

        elif event.key == 'up':
            # Shift all markers earlier (subtract) by 100ms/.1s
            self.shift_onsets(-.1)

        elif event.key == 'down':
            # Shift all markers later (add) by 100ms/.1s
            self.shift_onsets(.1)
Example 7
    # NOTE(review): this is the interior of a CLI entry point whose
    # signature lies outside this excerpt; `args`, `dframe`, `logger`,
    # and `utils` are presumably bound by the enclosing scope.
    if args.ignore_no_instrument:
        dframe = dframe.loc[dframe['instrument'].dropna().index]

    # Get the data directory by finding the common prefix from the onsets_file
    # Assumes that all onsets files live in the same place!
    data_dir = set(dframe['onsets_file'].dropna().apply(
        os.path.dirname).tolist()).pop()

    if not args.random and not args.index:
        marked_idxs = []
        completed_idxs = []

        count = 0
        for n, (idx, row) in enumerate(dframe.iterrows()):
            # Allow resuming a long annotation session partway through.
            if int(args.startat) >= 0 and n < args.startat:
                logger.info("Skipping {}".format(idx))
                continue

            quit, marked = annotate_one(row, data_dir,
                                        skip_existing=args.skip_existing)

            if quit:
                logger.info("Application Exiting...")
                break
            if marked:
                marked_idxs.append(idx)
            else:
                completed_idxs.append(idx)

            # NOTE(review): `count` is never incremented in this visible
            # excerpt, so this always logs "Completed 0 of N" -- confirm
            # whether `n` was intended or a `count += 1` is missing.
            logger.info(utils.colorize("Completed {} of {}".format(
                count, len(dframe))))
def audio_collection_to_observations(segment_index_file, note_index_file,
                                     note_audio_dir, limit_n_files=None,
                                     note_duration=None):
    """Split a segmented audio collection into per-note observations.

    Parameters
    ----------
    segment_index_file : str
        Input file containing all pointers to audio files and
        onsets files.

    note_index_file: str
        Path to the output index file which will contain pointers
        to the output note audio, and the metadata relating to it.

    note_audio_dir : str
        Path to store the resulting audio file.

    limit_n_files : int or None, default=None
        If given, stop after processing this many records.

    note_duration : float or None, default=None
        Forwarded to `audio_to_observations`; presumably a fixed note
        length -- TODO confirm against that function's signature.

    Returns
    -------
    success : bool
        True if the method completed as expected.
    """
    logger.info("Begin audio collection segmentation")
    logger.debug("Loading segment index")
    segment_df = pd.read_csv(segment_index_file, index_col=0)
    logger.debug("loaded {} records.".format(len(segment_df)))

    if segment_df.empty:
        logger.warning(utils.colorize(
            "No data available in {}; exiting.".format(segment_index_file),
            color='red'))
        # Return True (success) so the makefile will continue to build
        # other datasets, even if this one is empty.
        return True

    # Drop rows that do not have onsets_files.
    segment_df = segment_df.loc[segment_df.onsets_file.dropna().index]
    utils.create_directory(note_audio_dir)
    count = 0
    observations = []
    for idx, row in segment_df.iterrows():
        # Defensive re-check; null-onset rows were already dropped above.
        if pd.isnull(row.onsets_file):
            logger.warning("No onset file for {} [{}]; moving on.".format(
                row.audio_file, row.dataset))
            continue
        observations += audio_to_observations(
            idx, row.audio_file, row.onsets_file, note_audio_dir,
            file_ext='flac', dataset=row.dataset, instrument=row.instrument,
            dynamic=row.dynamic, note_duration=note_duration)
        logger.debug("Generated {} observations ({} of {}).".format(
            len(observations), (count + 1), len(segment_df)))

        if PRINT_PROGRESS:
            # Carriage-return progress line, overwritten in place.
            print("Progress: {:0.1f}% ({} of {})\r".format(
                (((count + 1) / float(len(segment_df))) * 100.),
                (count + 1), len(segment_df)), end='')
            sys.stdout.flush()
        count += 1

        if limit_n_files and count >= limit_n_files:
            break

    if PRINT_PROGRESS:
        print()

    collection = model.Collection(observations)
    collection.to_dataframe().to_csv(note_index_file)
    logger.debug("Wrote note index to {} with {} records".format(
        note_index_file, len(collection)))
    logger.info("Completed audio collection segmentation")
    return os.path.exists(note_index_file)
Example 9
def create_example_dataset(destination_dir, source_indexes, note_audio_dir,
                           n_per_instrument, output_index="master_index.csv",
                           partition_index_fmt="{}_test_partition.csv",
                           train_val_split=0.2):
    """Copy `n_per_instrument` from each instrument class in source_indexes
    to destination_dir, and create a new index file at the destination.

    Parameters
    ----------
    destination_dir : str
        Output path for writing data.

    source_indexes : list of str
        Set of index paths to use.

    note_audio_dir : str
        Directory of source audio files.

    n_per_instrument : int
        Number of observations to sample per instrument.

    output_index : str
        Basename of the output index to write.

    partition_index_fmt : str
        Format string for per-test-set partition index basenames.

    train_val_split : float, default=0.2
        Forwarded to `train_test_split`; presumably the validation
        fraction -- TODO confirm against that function's signature.

    Returns
    -------
    success : bool
        True if the process completed successfully.

    Raises
    ------
    ValueError
        If the sampled result covers fewer than two datasets.
    """
    logger.info("create_example_dataset({}, {}, n_per_instrument={})".format(
        destination_dir, source_indexes, n_per_instrument))

    boltons.fileutils.mkdir_p(destination_dir)
    boltons.fileutils.mkdir_p(note_audio_dir)

    dframe = join_dataframes(source_indexes)
    dframe = minst.taxonomy.normalize_instrument_names(dframe)

    indexes = []
    values = []
    for dataset in dframe.dataset.unique():
        dset_df = dframe[dframe.dataset == dataset]
        logger.info("Dataset: {} Available Notes:\n {}"
                    .format(dataset, dset_df['instrument'].value_counts()))

        # TODO: only use accepted instrument types
        for instrument_type in dset_df.instrument.unique():
            inst_df = dset_df[dset_df.instrument == instrument_type]
            # Random sample of n_per_instrument rows for this instrument.
            records = inst_df.sample(n_per_instrument)
            for idx, row in records.iterrows():

                shutil.copy(os.path.join(note_audio_dir, row['audio_file']),
                            os.path.join(destination_dir, row['audio_file']))
                indexes.append(idx)
                values.append(row)

    result_df = pd.DataFrame(values, index=indexes)
    output_file = os.path.join(destination_dir, output_index)
    result_df.to_csv(output_file)
    logger.info("Copied {} files to {}"
                .format(len(result_df), destination_dir))

    success = True
    if len(result_df.dataset.unique()) < 2:
        raise ValueError("Need more datasets for partitioning!")

    # Build one hold-out partition per dataset; overall success requires
    # every split to succeed.
    for test_set in result_df.dataset.unique():
        partition_index = os.path.join(destination_dir,
                                       partition_index_fmt.format(test_set))
        success &= train_test_split(output_file, test_set, train_val_split,
                                    partition_index)
    return os.path.exists(output_file) and success
Example 10
                                    partition_index)
    return os.path.exists(output_file) and success


if __name__ == "__main__":
    # Parse CLI arguments from the module docstring (docopt convention).
    arguments = docopt(__doc__)

    # --verbose flag toggles DEBUG logging.
    level = 'INFO' if not arguments.get('--verbose') else 'DEBUG'
    logging.config.dictConfig(minst.logger.get_config(level))

    logger.debug(arguments)

    t0 = time.time()
    # Dispatch on the docopt command verb.
    if arguments['join']:
        join_note_files(arguments['<sources>'], arguments['--output'])
    elif arguments['split']:
        train_test_split(arguments['<source_index>'][0],
                         # the above requires the [0] because we use
                         # source_index as a list for examples...
                         arguments['<test_set>'],
                         float(arguments['<train_val_split>']),
                         arguments['<output>'])
    elif arguments['example']:
        create_example_dataset(
            arguments['<destination_dir>'],
            arguments['<source_index>'],
            arguments['<note_audio_dir>'],
            int(arguments['--n_per_instrument']))
    t_end = time.time()
    logger.info("manage_dataset.py completed in: {}s".format(t_end - t0))
Example 11
def create_example_dataset(destination_dir,
                           source_indexes,
                           note_audio_dir,
                           n_per_instrument,
                           output_index="master_index.csv",
                           partition_index_fmt="{}_test_partition.csv",
                           train_val_split=0.2):
    """Copy `n_per_instrument` from each instrument class in source_indexes
    to destination_dir, and create a new index file at the destination.

    Parameters
    ----------
    destination_dir : str
        Output path for writing data.

    source_indexes : list of str
        Set of index paths to use.

    note_audio_dir : str
        Directory of source audio files.

    n_per_instrument : int
        Number of observations to sample per instrument.

    output_index : str
        Basename of the output index to write.

    Returns
    -------
    success : bool
        True if the process completed successfully.
    """
    logger.info("create_example_dataset({}, {}, n_per_instrument={})".format(
        destination_dir, source_indexes, n_per_instrument))

    # Make sure both directories exist before copying anything.
    for dirname in (destination_dir, note_audio_dir):
        boltons.fileutils.mkdir_p(dirname)

    dframe = minst.taxonomy.normalize_instrument_names(
        join_dataframes(source_indexes))

    sampled_idxs = []
    sampled_rows = []
    for dataset_name in dframe.dataset.unique():
        subset = dframe[dframe.dataset == dataset_name]
        logger.info("Dataset: {} Available Notes:\n {}".format(
            dataset_name, subset['instrument'].value_counts()))

        # TODO: only use accepted instrument types
        for inst_name in subset.instrument.unique():
            picks = subset[subset.instrument == inst_name].sample(
                n_per_instrument)
            for row_idx, record in picks.iterrows():
                source_path = os.path.join(note_audio_dir,
                                           record['audio_file'])
                dest_path = os.path.join(destination_dir,
                                         record['audio_file'])
                shutil.copy(source_path, dest_path)
                sampled_idxs.append(row_idx)
                sampled_rows.append(record)

    result_df = pd.DataFrame(sampled_rows, index=sampled_idxs)
    output_file = os.path.join(destination_dir, output_index)
    result_df.to_csv(output_file)
    logger.info("Copied {} files to {}".format(len(result_df),
                                               destination_dir))

    if len(result_df.dataset.unique()) < 2:
        raise ValueError("Need more datasets for partitioning!")

    # Partition once per dataset; all splits must succeed.
    success = True
    for test_set in result_df.dataset.unique():
        partition_index = os.path.join(
            destination_dir, partition_index_fmt.format(test_set))
        success &= train_test_split(output_file, test_set, train_val_split,
                                    partition_index)
    return os.path.exists(output_file) and success
Example 12
                                    partition_index)
    return os.path.exists(output_file) and success


if __name__ == "__main__":
    # docopt parses the CLI spec embedded in the module docstring.
    arguments = docopt(__doc__)

    # --verbose flips logging from INFO to DEBUG.
    level = 'DEBUG' if arguments.get('--verbose') else 'INFO'
    logging.config.dictConfig(minst.logger.get_config(level))

    logger.debug(arguments)

    start = time.time()
    if arguments['join']:
        join_note_files(arguments['<sources>'], arguments['--output'])
    elif arguments['split']:
        # source_index is parsed as a list (it is shared with the
        # `example` command), so take its first element here.
        train_test_split(arguments['<source_index>'][0],
                         arguments['<test_set>'],
                         float(arguments['<train_val_split>']),
                         arguments['<output>'])
    elif arguments['example']:
        create_example_dataset(arguments['<destination_dir>'],
                               arguments['<source_index>'],
                               arguments['<note_audio_dir>'],
                               int(arguments['--n_per_instrument']))
    t_end = time.time()
    logger.info("manage_dataset.py completed in: {}s".format(t_end - start))