def get_data(**kwargs):
    ''' Load the audio corpus as a GroupXData object, building a cache if needed.

    If the file DATAFILE_MAT already exists it is loaded directly via
    GroupXData.LoadFromFile. Otherwise the 'gfccs' dataset of every group in
    '../tracks.h5' is read, the per-track arrays are stacked row-wise, and
    the resulting dataset is saved to DATAFILE_MAT for subsequent calls.

    Keyword args
    ------------
    Accepted for interface compatibility; none are used by this function.

    Returns
    -------
    Data : GroupXData
        Has attributes `name` and `summary` set to fixed descriptive strings.
        NOTE(review): the counts in `summary` are hard-coded and are not
        recomputed from the data actually loaded.
    '''
    if os.path.exists(DATAFILE_MAT):
        Data = GroupXData.LoadFromFile(DATAFILE_MAT)
    else:
        obs = []
        # doc_range[i]:doc_range[i+1] delimits the rows of track i within X.
        doc_range = [0]
        count = 0
        with h5py.File('../tracks.h5', 'r') as tracks:
            for _track_name, grp in ProgressBar(tracks.items()):
                # Tracks lacking the 'gfccs' dataset contribute nothing.
                if 'gfccs' not in grp:
                    continue
                data = grp['gfccs']
                count += data.shape[0]
                doc_range.append(count)
                # `Dataset.value` was removed in h5py 3.0; indexing with an
                # empty tuple reads the whole dataset on both 2.x and 3.x.
                obs.append(data[()].astype(np.float64))
        X = np.vstack(obs)
        Data = GroupXData(X=X, doc_range=doc_range)
        Data.save_to_mat(DATAFILE_MAT)
    Data.name = 'AudioCorpus'
    Data.summary = 'Audio Corpus. obs=10.5M docs=559'
    return Data
def loadDataForBatch(self, batchID):
    ''' Read from disk the data file assigned to one batch.

    The loader is chosen from the file path and self.dataset_type:
    paths ending in '.ldac' use BagOfWordsData.LoadFromFile_ldac,
    dataset_type 'GroupXData' uses GroupXData.LoadFromFile, and anything
    else uses XData.read_file. self.DataInfo is forwarded as keyword
    arguments to whichever loader is selected.

    Parameters
    ----------
    batchID : index into self.datafileList

    Returns
    -------
    Dchunk : bnpy.data.DataObj subclass
    '''
    batch_path = self.datafileList[batchID]
    if batch_path.endswith('.ldac'):
        loader = BagOfWordsData.LoadFromFile_ldac
    elif self.dataset_type == 'GroupXData':
        loader = GroupXData.LoadFromFile
    else:
        loader = XData.read_file
    return loader(batch_path, **self.DataInfo)
def loadDataForSlice(filepath='', dataset_type='', **kwargs):
    """ Load a data object from the given file.

    Paths ending in '.ldac' are read with BagOfWordsData.LoadFromFile_ldac.
    Any other path is read with the LoadFromFile constructor of GroupXData
    (when dataset_type == 'GroupXData') or of XData (otherwise).

    Keyword args
    ------------
    workerID
    nWorkers
        These, and any other keyword arguments, are passed through
        unchanged to the selected loader.

    Returns
    -------
    Data object produced by the selected loader.
    """
    if filepath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(filepath, **kwargs)
    data_cls = GroupXData if dataset_type == 'GroupXData' else XData
    return data_cls.LoadFromFile(filepath, **kwargs)