Example 1
    def __init__(self, path='datasets/Nottingham',
                 source='http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.zip',
                 train_filter='.*train.*',
                 valid_filter='.*valid.*',
                 test_filter='.*test.*'):

        super(Nottingham, self).__init__(path=path, source=source,
                                         train_filter=train_filter,
                                         valid_filter=valid_filter,
                                         test_filter=test_filter)

        # read each MIDI file into a piano-roll array: r=(21, 109) spans the
        # 88 piano keys and dt=0.3 s is the time step (config is theano.config)
        train_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
            for f in find_files(self.path, train_filter)
            ]
        valid_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
            for f in find_files(self.path, valid_filter)
            ]
        test_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
            for f in find_files(self.path, test_filter)
            ]

        self.train_inputs = numpy.concatenate(train_datasets)
        self.train_targets = None

        self.valid_inputs = numpy.concatenate(valid_datasets)
        self.valid_targets = None

        self.test_inputs = numpy.concatenate(test_datasets)
        self.test_targets = None
Example 2
    def __init__(self, path='datasets/MuseData',
                 source='http://www-etud.iro.umontreal.ca/~boulanni/MuseData.zip',
                 train_filter='.*train.*',
                 valid_filter='.*valid.*',
                 test_filter='.*test.*'):

        super(MuseData, self).__init__(path=path, source=source,
                                       train_filter=train_filter,
                                       valid_filter=valid_filter,
                                       test_filter=test_filter)

        # read each MIDI file into a piano-roll array: r=(21, 109) spans the
        # 88 piano keys and dt=0.3 s is the time step
        train_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in find_files(self.path, train_filter)
            ]
        valid_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in find_files(self.path, valid_filter)
            ]
        test_datasets = [
            midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in find_files(self.path, test_filter)
            ]

        self.train_inputs = numpy.concatenate(train_datasets)
        self.train_targets = None

        self.valid_inputs = numpy.concatenate(valid_datasets)
        self.valid_targets = None

        self.test_inputs = numpy.concatenate(test_datasets)
        self.test_targets = None
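
Both examples above follow the same pattern: each MIDI file becomes a
(timesteps, 88) piano-roll array, and numpy.concatenate stacks the per-song
arrays along the time axis, so song boundaries are not preserved in the
resulting inputs. A self-contained sketch of that concatenation step, using
zero arrays as stand-ins for the piano rolls:

    import numpy

    # stand-ins for the midiread(...).piano_roll arrays above: three "songs"
    # of different lengths, each a (timesteps, 88) array
    songs = [numpy.zeros((n, 88), dtype='float32') for n in (120, 95, 210)]
    inputs = numpy.concatenate(songs)   # stacks along time (axis 0)
    print(inputs.shape)                 # (425, 88)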
Example 3
 def __iter__(self):
     for fname in files.find_files(self.path, self.filter):
         if self.preprocess is not None and callable(self.preprocess):
             fname = self.preprocess(fname)
         fnames = raise_to_list(fname)
         for name in fnames:
             yield name
Example 4
 def __iter__(self):
     for fname in find_files(self.path, self.filter):
         if self.preprocess is not None and callable(self.preprocess):
             fname = self.preprocess(fname)
         fnames = raise_to_list(fname)
         for name in fnames:
             yield name
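
In both filename iterators above, preprocess may map one filename to several,
and raise_to_list normalizes the result to a list before yielding. A
self-contained sketch of that idiom, with a stand-in for the OpenDeep
raise_to_list helper and a hypothetical preprocess:

    def raise_to_list(value):
        # stand-in for the OpenDeep helper: wrap a single value in a list
        return value if isinstance(value, (list, tuple)) else [value]

    def lowercase_name(fname):          # hypothetical preprocess
        return fname.lower()

    for name in raise_to_list(lowercase_name('Jigs.MID')):
        print(name)                     # jigs.mid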
Example 5
 def __iter__(self):
     for fname in files.find_files(self.path, self.filter):
         try:
             with Image.open(fname) as im:
                 data = numpy.array(im)
                 if self.preprocess is not None and callable(self.preprocess):
                     data = self.preprocess(data)
                 data = raise_to_list(data)
                 for d in data:
                     yield d
         except Exception as err:
             log.exception(str(err))
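
Image.open here comes from PIL/Pillow (which supports the context-manager form
used above), and numpy.array(im) decodes the pixels into an (H, W) or
(H, W, C) array. A self-contained sketch of the per-image step, with a
hypothetical preprocess that scales pixels to [0, 1]:

    import numpy
    from PIL import Image

    def preprocess(data):
        return data.astype('float32') / 255.0   # hypothetical normalizer

    with Image.open('example.png') as im:        # hypothetical image file
        data = preprocess(numpy.array(im))
    print(data.shape, data.min(), data.max())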
Example 6
 def __iter__(self):
     idx = 0
     for fname in files.find_files(self.path, self.filter):
         with open(fname, 'r') as f:
             for line in f:
                 if self.preprocess is not None:
                     line = self.preprocess(line)
                 line = raise_to_list(line)
                 for token in line:
                     if idx >= self.n_future:
                         yield token
                     else:
                         idx += 1
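
The idx counter above skips the first n_future tokens of the stream, which
offsets this iterator relative to an unshifted copy; pairing the two yields
(input, target) pairs for next-token prediction. A simplified, self-contained
sketch of that offset (hypothetical token source; the real code keeps a single
counter across all files):

    def tokens(stream, n_future=0):
        for idx, token in enumerate(stream):
            if idx >= n_future:
                yield token

    text = 'the quick brown fox'.split()
    for current, nxt in zip(tokens(text), tokens(text, n_future=1)):
        print(current, '->', nxt)   # the -> quick, quick -> brown, brown -> fox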
Example 7
 def __iter__(self):
     for fname in find_files(self.path, self.filter):
         try:
             with Image.open(fname) as im:
                 data = numpy.array(im)
                 if self.preprocess is not None and callable(self.preprocess):
                     data = self.preprocess(data)
                 data = raise_to_list(data)
                 for d in data:
                     yield d
         except Exception as err:
             _log.exception(str(err))
Example 8
 def __iter__(self):
     idx = 0
     for fname in files.find_files(self.path, self.filter):
         try:
             with open(fname, 'r') as f:
                 for line in f:
                     if self.preprocess is not None and callable(self.preprocess):
                         line = self.preprocess(line)
                     line = raise_to_list(line)
                     for token in line:
                         if idx >= self.n_future:
                             yield token
                         else:
                             idx += 1
         except Exception as err:
             log.exception(str(err))
Example 9
 def __iter__(self):
     idx = 0
     for fname in find_files(self.path, self.filter):
         try:
             with open(fname, 'r') as f:
                 for line in f:
                     if self.preprocess is not None and callable(self.preprocess):
                         line = self.preprocess(line)
                     line = raise_to_list(line)
                     for token in line:
                         if idx >= self.n_future:
                             yield token
                         else:
                             idx += 1
         except Exception as err:
             _log.exception(str(err))
Example 10
 def __init__(
     self,
     path=DEFAULT_TEDLIUM_DATASET_PATH,
     window_duration=0.01,
     skip_count=1,
     max_speeches = None,
 ):
     """Initialize the Dataset with a given storage for TEDLIUM
     
     path -- target path for the TED LIUM data storage
     window_duration -- duration of the audio window in seconds
     skip_count -- step size across the segments in the repo
                   used to do a very small subset of the dataset 
                   when doing testing iterations. This allows you
                   to test an "epoch" across a small subset of the 
                   40GB data-file
     """
     self.window_size = 2 ** int(math.ceil(math.log(int(window_duration * 16000), 2)))
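     # e.g., the default window_duration=0.01 at the 16 kHz sample rate
     # assumed here gives int(0.01 * 16000) = 160 samples, rounded up to
     # the next power of two: 2**8 = 256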
     source_filename = path + '.tar.gz'
     if not os.path.exists(path):
         if os.path.exists(source_filename):
             # Note: this could, in theory, overwrite anything on disk, as the Python
             # tarfile module doesn't prevent writing outside the root directory
             # (according to its docs).
             file_ops.untar(source_filename, destination_dir=os.path.dirname(path))
     if not os.path.exists(path):
         raise RuntimeError(
             "You need to download the TEDLIUM corpus (v2) from %(url)s and save it to %(path)s"%{
                 'url': LIUM_BASE + TEDLIUM_DOWNLOAD_URL,
                 'path': source_filename,
             }
         )
     path = os.path.realpath(path)
     log.info("Searching for speeches")
     self.train_speeches = [
         tedlium.Speech(sph, window_size=self.window_size)
         for sph in file_ops.find_files(path, '.*[/]train[/]sph[/].*[.]sph')
     ]
     if max_speeches:
         self.train_speeches = self.train_speeches[:max_speeches]
     self.test_speeches = [
         tedlium.Speech(sph, window_size=self.window_size)
         for sph in file_ops.find_files(path, '.*[/]test[/]sph[/].*[.]sph')
     ]
     if max_speeches:
         self.test_speeches = self.test_speeches[:max_speeches]
     self.valid_speeches = [
         tedlium.Speech(sph, window_size=self.window_size)
         for sph in file_ops.find_files(path, '.*[/]dev[/]sph[/].*[.]sph')
     ]
     if max_speeches:
         self.valid_speeches = self.valid_speeches[:max_speeches]
     log.info(
         "Creating speech segments (utterance records using 1/%s of the utterances)",
         skip_count,
     )
     train_inputs, train_targets = inputs_and_targets(self.train_speeches)
     valid_inputs, valid_targets = inputs_and_targets(self.valid_speeches)
     test_inputs, test_targets = inputs_and_targets(self.test_speeches)
     log.info("Initializing the OpenDeep dataset")
     super(TEDLIUMDataset, self).__init__(
         train_inputs=train_inputs, train_targets=train_targets,
         valid_inputs=valid_inputs, valid_targets=valid_targets,
         test_inputs=test_inputs, test_targets=test_targets,
     )
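
A minimal usage sketch (hypothetical values; assumes the TEDLIUM v2 tarball has
already been fetched as described in the RuntimeError message above):

    dataset = TEDLIUMDataset(
        window_duration=0.01,   # -> 256-sample windows at the assumed 16 kHz
        skip_count=100,         # per the docstring: test with 1/100 of the segments
        max_speeches=5,         # cap each split at five speeches
    )
    print(len(dataset.train_speeches), 'training speeches loaded')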