def test_save():
    # Empty content
    ListDictContainer({}).save(filename=os.path.join(tempfile.gettempdir(), 'saved.yaml'))

    # Content
    data = [
        {
            'key1': 100,
            'key2': 402.2,
        },
        {
            'key1': 200,
            'key2': 302.2,
        },
        {
            'key1': 300,
            'key2': 202.3,
        },
        {
            'key1': 400,
            'key2': 101.2,
        },
    ]

    d = ListDictContainer(
        data,
        filename=os.path.join(tempfile.gettempdir(), 'saved.yaml')
    ).save().load()
    nose.tools.assert_list_equal(d, data)

    d = ListDictContainer(
        data,
        filename=os.path.join(tempfile.gettempdir(), 'saved.csv')
    ).save().load(fields=['key1', 'key2'])
    nose.tools.assert_list_equal(d, data)

    d = ListDictContainer(
        data,
        filename=os.path.join(tempfile.gettempdir(), 'saved.csv')
    ).save(fields=['key1', 'key2']).load(fields=['key1', 'key2'])
    nose.tools.assert_list_equal(d, data)

    d = ListDictContainer(
        data,
        filename=os.path.join(tempfile.gettempdir(), 'saved.cpickle')
    ).save().load()
    nose.tools.assert_list_equal(d, data)
def test_container():
    data = ListDictContainer([
        {
            'key1': 100,
            'key2': 400,
        },
        {
            'key1': 200,
            'key2': 300,
        },
        {
            'key1': 300,
            'key2': 200,
        },
        {
            'key1': 400,
            'key2': 100,
        },
    ])

    column = data.get_field(field_name='key1')
    nose.tools.eq_(column, [100, 200, 300, 400])

    column = data.get_field(field_name='key2')
    nose.tools.eq_(column, [400, 300, 200, 100])

    nose.tools.eq_(
        data.search(key='key1', value=100),
        {
            'key1': 100,
            'key2': 400
        }
    )

    nose.tools.eq_(data.search(key='key1', value=123), None)
def parse(self, recipe):
    """Parse feature vector recipe

    Overall format: [block #1];[block #2];[block #3];...

    Block formats:

    - [label (string)]=full vector
    - [label (string)]=[start index (int)]-[end index (int)] => default stream and vector [start:end]
    - [label (string)]=[stream (int or string)]:[start index (int)]-[end index (int)] => specified stream and vector [start:end]
    - [label (string)]=1,2,3,4,5 => vector [1,2,3,4,5]
    - [label (string)]=0 => specified stream and full vector

    Parameters
    ----------
    recipe : str
        Feature recipe

    Returns
    -------
    data : ListDictContainer
        Feature recipe structure

    """
    if isinstance(recipe, six.string_types):
        data = []
        labels = recipe.split(self.delimiters['block'])
        for label in labels:
            label = label.strip()
            if label:
                detail_parts = label.split(self.delimiters['detail'])
                label = detail_parts[0].strip()

                # Default values, used when only the extractor is defined,
                # e.g. [extractor (string)];[extractor (string)]
                vector_index_structure = {
                    'stream': self.default_stream,
                    'selection': False,
                    'full': True,
                }

                # Inspect recipe further
                if len(detail_parts) == 2:
                    main_index_parts = detail_parts[1].split(self.delimiters['dimension'])
                    vector_indexing_string = detail_parts[1]

                    if len(main_index_parts) > 1:
                        # Channel has been defined,
                        # e.g. [extractor (string)]=[channel (int)]:[start index (int)]-[end index (int)]
                        vector_index_structure['stream'] = int(main_index_parts[0])
                        vector_indexing_string = main_index_parts[1]

                    vector_indexing = vector_indexing_string.split(self.delimiters['segment'])
                    if len(vector_indexing) > 1:
                        vector_index_structure['start'] = int(vector_indexing[0].strip())
                        vector_index_structure['stop'] = int(vector_indexing[1].strip()) + 1
                        vector_index_structure['full'] = False
                        vector_index_structure['selection'] = False

                    else:
                        vector_indexing = vector_indexing_string.split(self.delimiters['vector'])
                        if len(vector_indexing) > 1:
                            a = list(map(int, vector_indexing))
                            vector_index_structure['full'] = False
                            vector_index_structure['selection'] = True
                            vector_index_structure['vector'] = a

                        else:
                            vector_index_structure['stream'] = int(vector_indexing[0])
                            vector_index_structure['full'] = True
                            vector_index_structure['selection'] = False

                    current_data = {
                        'label': label,
                        'vector-index': vector_index_structure,
                    }

                else:
                    current_data = {
                        'label': label,
                    }

                data.append(current_data)

        from dcase_util.containers import ListDictContainer
        return ListDictContainer(data)

    else:
        return recipe
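# Usage sketch for the recipe parser above. Assumptions (not taken from this file):
# the parse() method belongs to dcase_util.utils.VectorRecipeParser, and the delimiters
# are block=';', detail='=', dimension=':', segment='-', vector=','. Under those
# assumptions, parsing a two-block recipe is expected to behave roughly as follows.
from dcase_util.utils import VectorRecipeParser

recipe_structure = VectorRecipeParser().parse(recipe='mfcc=0-19;energy=0')

# Expected content, one dict per block:
#   {'label': 'mfcc',
#    'vector-index': {'stream': <default stream>, 'selection': False,
#                     'full': False, 'start': 0, 'stop': 20}}   # end index is inclusive, hence stop=20
#   {'label': 'energy',
#    'vector-index': {'stream': 0, 'selection': False, 'full': True}}
print(recipe_structure)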
def prepare(self):
    """Prepare dataset for the usage.

    Returns
    -------
    self

    """
    if not self.meta_container.exists():
        scene_label = 'home'

        evaluation_chunks = ListDictContainer(
            filename=os.path.join(self.local_path, 'chime_home', 'evaluation_chunks_refined.csv')
        ).load(fields=['id', 'filename', 'set_id'])

        audio_files = {}
        for item in evaluation_chunks:
            audio_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + self.sample_mode + '.wav'
            )
            annotation_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + '.csv'
            )

            if audio_filename not in audio_files:
                audio_files[audio_filename] = {
                    'audio': audio_filename,
                    'meta': annotation_filename
                }

        meta_data = MetaDataContainer()
        for audio_filename, data in iteritems(audio_files):
            current_meta_data = DictContainer(
                filename=os.path.join(self.local_path, data['meta'])
            ).load()

            tags = []
            for i, tag in enumerate(current_meta_data['majorityvote']):
                if tag != 'S' and tag != 'U':
                    tags.append(self.tagcode_to_taglabel(tag))

            name = os.path.split(audio_filename)[1]
            segment_name = name[0:name.find('_chunk')]
            chunk_name = name[name.find('_chunk') + 1:].split('.')[0]

            item = MetaDataItem({
                'filename': audio_filename,
                'scene_label': scene_label,
                'tags': ';'.join(tags) + ';',
                'identifier': segment_name
            })

            self.process_meta_item(item=item, absolute_path=False)

            meta_data.append(item)

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    all_folds_found = True

    train_filename = self.evaluation_setup_filename(setup_part='train')
    test_filename = self.evaluation_setup_filename(setup_part='test')
    eval_filename = self.evaluation_setup_filename(setup_part='evaluate')

    if not os.path.isfile(train_filename):
        all_folds_found = False

    if not os.path.isfile(test_filename):
        all_folds_found = False

    if not os.path.isfile(eval_filename):
        all_folds_found = False

    if not all_folds_found:
        Path().makedirs(path=self.evaluation_setup_path)

        # Train
        train_filename = self.evaluation_setup_filename(setup_part='train')
        train_meta = MetaDataContainer(filename=train_filename)
        for filename in self.train_files():
            train_meta.append(self.file_meta(filename)[0])
        train_meta.save()

        # Test
        test_filename = self.evaluation_setup_filename(setup_part='test')
        test_meta = MetaDataContainer(filename=test_filename)
        for filename in self.test_files():
            test_meta.append(
                MetaDataItem({'filename': self.absolute_to_relative_path(filename)})
            )
        test_meta.save()

        # Evaluate
        eval_filename = self.evaluation_setup_filename(setup_part='evaluate')
        eval_meta = MetaDataContainer(filename=eval_filename)
        for filename in self.test_files():
            eval_meta.append(self.file_meta(filename)[0])
        eval_meta.save()

        # Load meta and cross validation
        self.load()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Returns
    -------
    self

    """
    if not self.meta_container.exists():
        scene_label = 'home'

        dcase_cross_val_data = ListDictContainer(
            filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv'
            )
        ).load(fields=['id', 'filename', 'set_id'])

        audio_files = {}
        for item in dcase_cross_val_data:
            audio_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + self.sample_mode + '.wav'
            )
            annotation_filename = os.path.join(
                'chime_home', 'chunks',
                item['filename'] + '.csv'
            )

            if audio_filename not in audio_files:
                audio_files[audio_filename] = {
                    'audio': audio_filename,
                    'meta': annotation_filename
                }

        meta_data = MetaDataContainer()
        for audio_filename, data in iteritems(audio_files):
            current_meta_data = DictContainer(
                filename=os.path.join(self.local_path, data['meta'])
            ).load()

            tags = []
            for i, tag in enumerate(current_meta_data['majorityvote']):
                if tag != 'S' and tag != 'U':
                    tags.append(self.tagcode_to_taglabel(tag))

            name = os.path.split(audio_filename)[1]
            segment_name = name[0:name.find('_chunk')]
            chunk_name = name[name.find('_chunk') + 1:].split('.')[0]

            item = MetaDataItem({
                'filename': audio_filename,
                'scene_label': scene_label,
                'tags': ';'.join(tags) + ';',
                'identifier': segment_name
            })

            self.process_meta_item(item=item, absolute_path=False)

            meta_data.append(item)

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    all_folds_found = True
    for fold in range(1, self.crossvalidation_folds + 1):
        train_filename = self.evaluation_setup_filename(setup_part='train', fold=fold)
        test_filename = self.evaluation_setup_filename(setup_part='test', fold=fold)
        eval_filename = self.evaluation_setup_filename(setup_part='evaluate', fold=fold)

        if not os.path.isfile(train_filename):
            all_folds_found = False

        if not os.path.isfile(test_filename):
            all_folds_found = False

        if not os.path.isfile(eval_filename):
            all_folds_found = False

    if not all_folds_found:
        Path().makedirs(path=self.evaluation_setup_path)

        dcase_crossval = {
            1: [],
            2: [],
            3: [],
            4: [],
            5: [],
        }

        dcase_cross_val_data = ListDictContainer(
            filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv'
            )
        ).load(fields=['id', 'filename', 'set_id'])

        for item in dcase_cross_val_data:
            dcase_crossval[int(item['set_id']) + 1].append(
                self.relative_to_absolute_path(
                    os.path.join(
                        'chime_home', 'chunks',
                        item['filename'] + self.sample_mode + '.wav'
                    )
                )
            )

        for fold in range(1, self.crossvalidation_folds + 1):
            # Collect training and testing files
            train_files = []
            for f in range(1, self.crossvalidation_folds + 1):
                if f != fold:
                    train_files += dcase_crossval[f]
            test_files = dcase_crossval[fold]

            # Create meta containers and save them

            # Train
            train_filename = self.evaluation_setup_filename(setup_part='train', fold=fold)
            train_meta = MetaDataContainer(filename=train_filename)
            for filename in train_files:
                item = self.file_meta(filename)[0]
                self.process_meta_item(item=item, absolute_path=False)
                train_meta.append(item)
            train_meta.save()

            # Test
            test_filename = self.evaluation_setup_filename(setup_part='test', fold=fold)
            test_meta = MetaDataContainer(filename=test_filename)
            for filename in test_files:
                item = MetaDataItem({'filename': self.absolute_to_relative_path(filename)})
                test_meta.append(item)
            test_meta.save()

            # Evaluate
            eval_filename = self.evaluation_setup_filename(setup_part='evaluate', fold=fold)
            eval_meta = MetaDataContainer(filename=eval_filename)
            for filename in test_files:
                item = self.file_meta(filename)[0]
                self.process_meta_item(item=item, absolute_path=False)
                eval_meta.append(item)
            eval_meta.save()

        # Load meta and cross validation
        self.load()

    return self
def extract_packages(self):
    """Extract the dataset packages

    Raises
    ------
    IOError
        Local package was not found.

    Returns
    -------
    self

    """
    # Make sure evaluation_setup directory exists
    Path().makedirs(path=os.path.join(self.local_path, self.evaluation_setup_folder))

    log = FancyLogger()
    item_access_log_filename = os.path.join(self.local_path, 'item_access_error.log.csv')

    if 'audio' in self.included_content_types or self.included_content_types == ['all']:
        # Audio content is requested, fetch the audio data
        log.title("Download_data")
        log.info("Once database is downloaded, do not forget to check your missing_files")

        non_existing_videos = pandas.DataFrame(columns=["filename", "error"])

        log.line("check files exist or download data")

        # Collect file ids
        for package in self.package_list:
            if package.get('content_type') == "meta":
                base_filepath = os.path.splitext(package.get('filename').split('/')[-1])[0]

                if 'train' in package.get('filename'):
                    result_audio_directory = os.path.join(
                        self.local_path, 'dataset/audio/train', base_filepath)
                else:
                    result_audio_directory = os.path.join(
                        self.local_path, 'dataset/audio/test')

                missing_files = download(package.get('filename'), result_audio_directory, n_jobs=3)

                if not missing_files.empty:
                    non_existing_videos = non_existing_videos.append(missing_files, ignore_index=True)

        # Save list of non-accessible videos
        ListDictContainer(
            non_existing_videos.to_dict(orient="records"),
            filename=item_access_log_filename
        ).save(fields=['filename', 'error'])

    # Evaluation setup filenames
    train_filename_fold1 = self.evaluation_setup_filename(
        setup_part='train', fold=1, file_extension='csv')

    test_filename_fold1 = self.evaluation_setup_filename(
        setup_part='test', fold=1, file_extension='csv')

    train_filename_fold2 = self.evaluation_setup_filename(
        setup_part='train', fold=2, file_extension='csv')

    test_filename_fold2 = self.evaluation_setup_filename(
        setup_part='test', fold=2, file_extension='csv')

    evaluate_filename = self.evaluation_setup_filename(
        setup_part='evaluate', fold=2, file_extension='csv')

    # Check that evaluation setup exists
    evaluation_setup_exists = True
    if not os.path.isfile(train_filename_fold1) or not os.path.isfile(test_filename_fold1) \
            or not os.path.isfile(train_filename_fold2) or not os.path.isfile(test_filename_fold2) \
            or not os.path.isfile(evaluate_filename) or not self.meta_container.exists():
        evaluation_setup_exists = False

    if not evaluation_setup_exists:
        # Evaluation setup was not found, generate one
        item_access_log_filename = os.path.join(self.local_path, 'item_access_error.log.csv')
        non_existing_videos = ListDictContainer().load(
            filename=item_access_log_filename,
            delimiter=','
        ).get_field_unique('filename')

        train_meta_weak_fold1 = MetaDataContainer()
        audio_path = 'dataset/audio/train/weak'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path, 'dataset/metadata/train/weak.csv'),
                fields=["filename", "tags"],
                csv_header=True):

            if item.filename not in non_existing_videos:
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' + self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(path_base=audio_path)

                # Only collect items which exist, when audio content is included
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(os.path.join(self.local_path, item.filename)):
                        train_meta_weak_fold1.append(item)
                else:
                    train_meta_weak_fold1.append(item)

        train_meta_weak_fold1.save(
            filename=train_filename_fold1, csv_header=True, file_format="CSV")

        test_meta_unlabel_fold1 = MetaDataContainer()
        audio_path = 'dataset/audio/train/unlabel_in_domain'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path, 'dataset/metadata/train/unlabel_in_domain.csv'),
                csv_header=True):

            if item.filename not in non_existing_videos:
                # If not the right extension, change it
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' + self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(path_base=audio_path)

                # Only collect items which exist, when audio content is included
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(os.path.join(self.local_path, item.filename)):
                        test_meta_unlabel_fold1.append(item)
                else:
                    test_meta_unlabel_fold1.append(item)

        test_meta_unlabel_fold1.save(
            filename=test_filename_fold1, csv_header=True, file_format="CSV")

        # Fold 2 train is all the data used in fold 1
        train_meta_weak_fold2 = MetaDataContainer()
        train_meta_weak_fold2 += MetaDataContainer().load(
            train_filename_fold1, csv_header=True, file_format="CSV")

        for item in MetaDataContainer().load(
                test_filename_fold1, csv_header=True, file_format="CSV"):
            item.tags = []
            train_meta_weak_fold2.append(item)

        train_meta_weak_fold2.save(filename=train_filename_fold2, csv_header=True)

        # Evaluate meta is the ground truth file with test annotations (test.csv)
        evaluate_meta = MetaDataContainer()
        audio_path = 'dataset/audio/test'
        for item in MetaDataContainer().load(
                os.path.join(self.local_path, 'dataset/metadata/test/test.csv'),
                csv_header=True):

            if item.filename not in non_existing_videos:
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' + self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(path_base=audio_path)

                # Only collect items which exist
                if 'audio' in self.included_content_types or 'all' in self.included_content_types:
                    if os.path.isfile(os.path.join(self.local_path, item.filename)):
                        evaluate_meta.append(item)
                else:
                    evaluate_meta.append(item)

        evaluate_meta.save(filename=evaluate_filename, csv_header=True, file_format="CSV")

        # Test meta lists the filenames of the evaluation set, labels will be predicted
        test_meta_strong_fold2 = MetaDataContainer()
        for filename in evaluate_meta.unique_files:
            test_meta_strong_fold2.append(MetaDataItem({'filename': filename}))

        test_meta_strong_fold2.save(
            filename=test_filename_fold2, csv_header=True, file_format="CSV")

        # meta_data is the default meta container containing all files of the dataset
        meta_data = MetaDataContainer()
        meta_data += MetaDataContainer().load(
            train_filename_fold1, csv_header=True, file_format="CSV")

        meta_data += MetaDataContainer().load(
            test_filename_fold1, csv_header=True, file_format="CSV")

        meta_data += MetaDataContainer().load(
            test_filename_fold2, csv_header=True, file_format="CSV")

        # Save meta
        meta_data.save(filename=self.meta_file)

    log.foot()

    return self
def prepare(self):
    """Prepare dataset for the usage.

    Returns
    -------
    self

    """
    if is_jupyter():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    # Make sure audio directory exists
    Path().makedirs(path=os.path.join(self.local_path, 'audio'))

    # Make sure evaluation_setup directory exists
    Path().makedirs(path=os.path.join(self.local_path, self.evaluation_setup_folder))

    if 'audio' in self.included_content_types:
        # Collect file ids
        files = []
        files += ListDictContainer(
            filename=os.path.join(self.local_path, 'testing_set.csv')
        ).load(fields=['query_id', 'segment_start', 'segment_end'])

        files += ListDictContainer(
            filename=os.path.join(self.local_path, 'training_set.csv')
        ).load(fields=['query_id', 'segment_start', 'segment_end'])

        file_progress = tqdm(
            files,
            desc="{0: <25s}".format('Files'),
            file=sys.stdout,
            leave=False,
            disable=self.disable_progress_bar,
            ascii=self.use_ascii_progress_bar
        )

        non_existing_videos = {}

        # Load list of already identified non-accessible videos
        item_access_log_filename = os.path.join(self.local_path, 'item_access_error.log.csv')
        if os.path.isfile(item_access_log_filename):
            for item in ListDictContainer(filename=item_access_log_filename).load(
                    fields=['query_id', 'error']):
                non_existing_videos[item['query_id']] = item

        # Check that audio files exist
        for file_data in file_progress:
            audio_filename = os.path.join(
                self.local_path, 'audio',
                'Y{query_id}_{segment_start}_{segment_end}.{extension}'.format(
                    query_id=file_data['query_id'],
                    segment_start=file_data['segment_start'],
                    segment_end=file_data['segment_end'],
                    extension=self.default_audio_extension
                )
            )

            # Download segment if it does not exist
            if not os.path.isfile(audio_filename) and file_data['query_id'] not in non_existing_videos:
                try:
                    AudioContainer().load_from_youtube(
                        query_id=file_data['query_id'],
                        start=file_data['segment_start'],
                        stop=file_data['segment_end']
                    ).save(filename=audio_filename)

                except IOError as e:
                    non_existing_videos[file_data['query_id']] = {
                        'error': str(e.message).replace('\n', ' '),
                        'query_id': file_data['query_id']
                    }

        # Save list of non-accessible videos
        ListDictContainer(
            list(non_existing_videos.values()),
            filename=item_access_log_filename
        ).save(fields=['query_id', 'error'])

    # Evaluation setup filenames
    train_filename = self.evaluation_setup_filename(
        setup_part='train', fold=1, scene_label='youtube', file_extension='txt')

    test_filename = self.evaluation_setup_filename(
        setup_part='test', fold=1, scene_label='youtube', file_extension='txt')

    evaluate_filename = self.evaluation_setup_filename(
        setup_part='evaluate', fold=1, scene_label='youtube', file_extension='txt')

    # Check that evaluation setup exists
    evaluation_setup_exists = True
    if not os.path.isfile(train_filename) or not os.path.isfile(test_filename) \
            or not os.path.isfile(evaluate_filename):
        evaluation_setup_exists = False

    if not evaluation_setup_exists:
        # Evaluation setup was not found, generate one
        fold = 1

        train_meta = MetaDataContainer()
        for item in MetaDataContainer().load(
                os.path.join(self.local_path, 'groundtruth_weak_label_training_set.csv')):
            if not item.filename.endswith(self.default_audio_extension):
                item.filename = os.path.join(
                    'audio',
                    'Y' + os.path.splitext(item.filename)[0] + '.' + self.default_audio_extension)

            # Set scene label
            item.scene_label = 'youtube'

            # Translate event onset and offset, weak labels
            item.offset -= item.onset
            item.onset -= item.onset

            # Only collect items which exist, when audio content is included
            if 'audio' in self.included_content_types:
                if os.path.isfile(os.path.join(self.local_path, item.filename)):
                    train_meta.append(item)
            else:
                train_meta.append(item)

        train_meta.save(
            filename=self.evaluation_setup_filename(
                setup_part='train', fold=fold, scene_label='youtube', file_extension='txt'))

        evaluate_meta = MetaDataContainer()
        for item in MetaDataContainer().load(
                os.path.join(self.local_path, 'groundtruth_strong_label_testing_set.csv')):
            if not item.filename.endswith(self.default_audio_extension):
                item.filename = os.path.join(
                    'audio',
                    'Y' + os.path.splitext(item.filename)[0] + '.' + self.default_audio_extension)

            # Set scene label
            item.scene_label = 'youtube'

            # Only collect items which exist
            if 'audio' in self.included_content_types:
                if os.path.isfile(os.path.join(self.local_path, item.filename)):
                    evaluate_meta.append(item)
            else:
                evaluate_meta.append(item)

        evaluate_meta.save(
            filename=self.evaluation_setup_filename(
                setup_part='evaluate', fold=fold, scene_label='youtube', file_extension='txt'))

        test_meta = MetaDataContainer()
        for item in evaluate_meta:
            test_meta.append(MetaDataItem({'filename': item.filename}))

        test_meta.save(
            filename=self.evaluation_setup_filename(
                setup_part='test', fold=fold, scene_label='youtube', file_extension='txt'))

        # Load meta and cross validation
        self.load()

    if not self.meta_container.exists():
        fold = 1
        meta_data = MetaDataContainer()
        meta_data += MetaDataContainer().load(
            self.evaluation_setup_filename(
                setup_part='train', fold=fold, scene_label='youtube', file_extension='txt'))

        meta_data += MetaDataContainer().load(
            self.evaluation_setup_filename(
                setup_part='evaluate', fold=fold, scene_label='youtube', file_extension='txt'))

        # Save meta
        meta_data.save(filename=self.meta_file)

        # Load meta and cross validation
        self.load()

    return self
def test_load_wrong_type():
    with dcase_util.utils.DisableLogger():
        ListDictContainer().load(
            filename=os.path.join(tempfile.gettempdir(), 'wrong.cpickle'))
def test_load_not_found2():
    with dcase_util.utils.DisableLogger():
        ListDictContainer().load(
            filename=os.path.join(tempfile.gettempdir(), 'wrong.txt'))