Code example #1
0
File: mixins.py — Project: dby124/dcase_util
    def compress(self,
                 filename=None,
                 path=None,
                 file_list=None,
                 size_limit=None,
                 overwrite=False):
        """Compress the package. Supports Zip and Tar packages.

        Parameters
        ----------
        filename : str
            Filename for the package. If None given, one given to class initializer is used.
            Default value None

        path : str
            Path to collect files from (recursively) when file_list is not set.
            Default value None

        file_list : list of dict
            List of files to be included to the package.
            Item format {'source': 'file1.txt', 'target': 'folder1/file1.txt'}.
            Default value None

        size_limit : int
            Size limit in bytes; when set, output is split into multiple
            packages, each kept below the limit (measured on uncompressed
            content).
            Default value None

        overwrite : bool
            Overwrite existing package.
            NOTE(review): accepted but not used by this implementation —
            confirm whether existing packages should be checked here.
            Default value False

        Returns
        -------
        self

        """

        if filename is not None:
            self.filename = filename
            self.detect_file_format()
            self.validate_format()

        # Collect the file list recursively from path when not given explicitly.
        if path is not None and file_list is None:
            files = Path(path=path).file_list(recursive=True)
            file_list = []
            for file in files:
                file_list.append({
                    'source': file,
                    'target': os.path.relpath(file)
                })

        if size_limit is None:
            # Single-package output.
            package = None

            if self.format == FileFormat.ZIP:
                package = zipfile.ZipFile(file=self.filename, mode='w')

            elif self.format == FileFormat.TAR:
                package = tarfile.open(name=self.filename, mode='w:gz')

            size_uncompressed = 0
            for item in file_list:
                if os.path.exists(item['source']):
                    if self.format == FileFormat.ZIP:
                        package.write(filename=item['source'],
                                      arcname=os.path.relpath(item['target']),
                                      compress_type=zipfile.ZIP_DEFLATED)
                        file_info = package.getinfo(
                            os.path.relpath(item['target']))
                        size_uncompressed += file_info.file_size

                    elif self.format == FileFormat.TAR:
                        package.add(name=item['source'],
                                    arcname=os.path.relpath(item['target']))
                        file_info = package.gettarinfo(name=item['source'],
                                                       arcname=os.path.relpath(
                                                           item['target']))
                        size_uncompressed += file_info.size

                else:
                    package.close()
                    # BUGFIX: the template was missing the {filename}
                    # placeholder even though the keyword was supplied, so the
                    # offending filename never appeared in the message.
                    message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                        name=self.__class__.__name__,
                        filename=item['source'],
                        package=self.filename)
                    if self.logger:
                        self.logger.exception(message)

                    raise IOError(message)

            package.close()

        else:
            # Multi-package output: filenames get a running package id
            # inserted before the extension.
            base, extension = os.path.splitext(self.filename)
            filename_template = base + '.{package_id}' + extension

            # Initialize first package
            package_id = 1

            size_uncompressed = 0
            if self.format == FileFormat.ZIP:
                package = zipfile.ZipFile(
                    file=filename_template.format(package_id=package_id),
                    mode='w')

            elif self.format == FileFormat.TAR:
                package = tarfile.open(
                    name=filename_template.format(package_id=package_id),
                    mode='w:gz')

            progress = tqdm(file_list,
                            desc="{0: <25s}".format('Compress'),
                            file=sys.stdout,
                            leave=False,
                            disable=self.disable_progress_bar,
                            ascii=self.use_ascii_progress_bar)

            for item_id, item in enumerate(progress):
                if self.disable_progress_bar:
                    self.logger.info(
                        '  {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.
                        format(title='Compress ',
                               item_id=item_id,
                               total=len(progress),
                               file=item['source']))

                if os.path.exists(item['source']):
                    current_size_uncompressed = os.path.getsize(item['source'])
                    if size_uncompressed + current_size_uncompressed > size_limit:
                        # Size limit met, close current package and open a new one.
                        package.close()

                        package_id += 1
                        if self.format == FileFormat.ZIP:
                            package = zipfile.ZipFile(
                                file=filename_template.format(
                                    package_id=package_id),
                                mode='w')

                        elif self.format == FileFormat.TAR:
                            package = tarfile.open(
                                name=filename_template.format(
                                    package_id=package_id),
                                mode='w:gz')

                        size_uncompressed = 0

                    if self.format == FileFormat.ZIP:
                        package.write(filename=item['source'],
                                      arcname=os.path.relpath(item['target']),
                                      compress_type=zipfile.ZIP_DEFLATED)

                        file_info = package.getinfo(
                            os.path.relpath(item['target']))
                        size_uncompressed += file_info.file_size

                    elif self.format == FileFormat.TAR:
                        package.add(name=item['source'],
                                    arcname=os.path.relpath(item['target']))
                        file_info = package.gettarinfo(name=item['source'],
                                                       arcname=os.path.relpath(
                                                           item['target']))
                        size_uncompressed += file_info.size

                else:
                    package.close()
                    # BUGFIX: restore the {filename} placeholder (see above).
                    message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                        name=self.__class__.__name__,
                        filename=item['source'],
                        package=filename_template.format(
                            package_id=package_id))
                    if self.logger:
                        self.logger.exception(message)

                    raise IOError(message)

            package.close()

        # BUGFIX: docstring promises `self` but the method fell off the end
        # and returned None; return self to allow chaining.
        return self
Code example #2
0
    def compress(self,
                 filename=None,
                 path=None,
                 file_list=None,
                 size_limit=None):
        """Compress the package. Supports Zip and Tar packages.

        Parameters
        ----------
        filename : str
            Filename for the package. If None given, one given to class initializer is used.
            Default value None

        path : str
            Path get files if file_list is not set. Files are collected recursively.
            Default value None

        file_list : list of dict
            List of files to be included to the package.
            Item format {'source': 'file1.txt', 'target': 'folder1/file1.txt'}.
            Default value None

        size_limit : int
            Size limit in bytes.
            Default value None

        Returns
        -------
        list of str
            Filenames of created packages

        """

        # Use the notebook-aware progress bar inside Jupyter.
        if is_jupyter():
            from tqdm import tqdm_notebook as tqdm
        else:
            from tqdm import tqdm

        if filename is not None:
            self.filename = filename
            self.detect_file_format()
            self.validate_format()

        # Collect the file list recursively from path when not given explicitly.
        if path is not None and file_list is None:
            files = Path(path=path).file_list(recursive=True)
            file_list = []

            # BUGFIX: loop variable renamed from 'filename' to avoid
            # shadowing the method parameter of the same name.
            for collected_file in files:
                file_list.append({
                    'source': collected_file,
                    'target': os.path.relpath(collected_file)
                })

        package_filenames = []

        # Total size of the source material, used to decide whether the
        # output needs to be split at all.
        total_uncompressed_size = 0
        for item in file_list:
            total_uncompressed_size += os.path.getsize(item['source'])

        if size_limit is None or total_uncompressed_size < size_limit:
            # Single-package output.
            package = None

            if self.format == FileFormat.ZIP:
                package = zipfile.ZipFile(file=self.filename, mode='w')

            elif self.format == FileFormat.TAR:
                package = tarfile.open(name=self.filename, mode='w:gz')

            package_filenames.append(self.filename)

            size_uncompressed = 0
            for item in file_list:
                if os.path.exists(item['source']):
                    if self.format == FileFormat.ZIP:
                        package.write(filename=item['source'],
                                      arcname=os.path.relpath(item['target']),
                                      compress_type=zipfile.ZIP_DEFLATED)
                        file_info = package.getinfo(
                            os.path.relpath(item['target']))
                        size_uncompressed += file_info.file_size

                    elif self.format == FileFormat.TAR:
                        package.add(name=item['source'],
                                    arcname=os.path.relpath(item['target']))
                        file_info = package.gettarinfo(name=item['source'],
                                                       arcname=os.path.relpath(
                                                           item['target']))
                        size_uncompressed += file_info.size

                else:
                    package.close()
                    # BUGFIX: the template was missing the {filename}
                    # placeholder even though the keyword was supplied.
                    message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                        name=self.__class__.__name__,
                        filename=item['source'],
                        package=self.filename)
                    if self.logger:
                        self.logger.exception(message)

                    raise IOError(message)

            package.close()

        else:
            # Multi-package output: filenames get a running package id
            # inserted before the extension.
            base, extension = os.path.splitext(self.filename)
            filename_template = base + '.{package_id}' + extension
            package = None

            # Initialize package
            package_id = 1

            size_uncompressed = 0
            if self.format == FileFormat.ZIP:
                package = zipfile.ZipFile(
                    file=filename_template.format(package_id=package_id),
                    mode='w')

            elif self.format == FileFormat.TAR:
                package = tarfile.open(
                    name=filename_template.format(package_id=package_id),
                    mode='w:gz')

            package_filenames.append(
                filename_template.format(package_id=package_id))

            progress = tqdm(file_list,
                            desc="{0: <25s}".format('Compress'),
                            file=sys.stdout,
                            leave=False,
                            disable=self.disable_progress_bar,
                            ascii=self.use_ascii_progress_bar)

            for item_id, item in enumerate(progress):
                if self.disable_progress_bar:
                    self.logger.info(
                        '  {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.
                        format(title='Compress ',
                               item_id=item_id,
                               total=len(progress),
                               file=item['source']))

                if os.path.exists(item['source']):
                    current_size_uncompressed = os.path.getsize(item['source'])
                    if size_uncompressed + current_size_uncompressed > size_limit:
                        # Size limit met, close current package and open a new one.
                        package.close()

                        package_id += 1
                        if self.format == FileFormat.ZIP:
                            package = zipfile.ZipFile(
                                file=filename_template.format(
                                    package_id=package_id),
                                mode='w')

                        elif self.format == FileFormat.TAR:
                            package = tarfile.open(
                                name=filename_template.format(
                                    package_id=package_id),
                                mode='w:gz')

                        package_filenames.append(
                            filename_template.format(package_id=package_id))

                        size_uncompressed = 0

                    if self.format == FileFormat.ZIP:
                        package.write(filename=item['source'],
                                      arcname=os.path.relpath(item['target']),
                                      compress_type=zipfile.ZIP_DEFLATED)

                        file_info = package.getinfo(
                            os.path.relpath(item['target']))
                        size_uncompressed += file_info.file_size

                    elif self.format == FileFormat.TAR:
                        package.add(name=item['source'],
                                    arcname=os.path.relpath(item['target']))
                        file_info = package.gettarinfo(name=item['source'],
                                                       arcname=os.path.relpath(
                                                           item['target']))
                        size_uncompressed += file_info.size

                else:
                    package.close()
                    # BUGFIX: restore the {filename} placeholder (see above).
                    message = '{name}: Non-existing file [{filename}] detected while compressing a package [{package}]'.format(
                        name=self.__class__.__name__,
                        filename=item['source'],
                        package=filename_template.format(
                            package_id=package_id))

                    if self.logger:
                        self.logger.exception(message)

                    raise IOError(message)

            package.close()

        return package_filenames
Code example #3
0
File: mixins.py — Project: dby124/dcase_util
    def extract(self,
                target_path=None,
                overwrite=False,
                omit_first_level=False):
        """Extract the package. Supports Zip and Tar packages.

        Parameters
        ----------
        target_path : str
            Path to extract the package content. If none given, package is extracted in the same path than package.
            Default value None

        overwrite : bool
            Overwrite existing files.
            Default value False

        omit_first_level : bool
            Omit first directory level.
            Default value False

        Returns
        -------
        self

        """

        # Default extraction target: the directory containing the package.
        if target_path is None:
            target_path = os.path.split(self.filename)[0]

        Path(target_path).create()

        if self.format == FileFormat.ZIP:
            with zipfile.ZipFile(self.filename, "r") as z:
                if omit_first_level:
                    # Collect the directory components of every file entry to
                    # find the common leading directory to strip.
                    parts = []
                    for name in z.namelist():
                        if not name.endswith('/'):
                            parts.append(name.split('/')[:-1])

                    prefix = os.path.commonprefix(parts) or ''

                    if prefix:
                        # Keep only the first path component so that exactly
                        # one directory level is omitted.
                        if len(prefix) > 1:
                            prefix_ = list()
                            prefix_.append(prefix[0])
                            prefix = prefix_

                        prefix = '/'.join(prefix) + '/'
                    # Number of leading characters to strip from member names
                    # (0 when no common prefix exists).
                    offset = len(prefix)

                # Start extraction
                members = z.infolist()
                file_count = 1
                progress = tqdm(members,
                                desc="{0: <25s}".format('Extract'),
                                file=sys.stdout,
                                leave=False,
                                disable=self.disable_progress_bar,
                                ascii=self.use_ascii_progress_bar)

                for i, member in enumerate(progress):
                    if self.disable_progress_bar:
                        self.logger.info(
                            '  {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'
                            .format(title='Extract ',
                                    item_id=i,
                                    total=len(progress),
                                    file=member.filename))

                    # Skip entries that lie entirely inside the stripped
                    # prefix. NOTE: 'offset' is only bound when
                    # omit_first_level is True; the short-circuit keeps the
                    # reference safe.
                    if not omit_first_level or len(member.filename) > offset:
                        if omit_first_level:
                            member.filename = member.filename[offset:]

                        progress.set_description("{0: >35s}".format(
                            member.filename.split('/')[-1]))
                        progress.update()

                        # Extract only when target file is missing or
                        # overwriting was requested.
                        if not os.path.isfile(
                                os.path.join(target_path,
                                             member.filename)) or overwrite:
                            try:
                                # Use the package password when one is set on
                                # the instance.
                                if hasattr(self, 'package_password'
                                           ) and self.package_password:
                                    z.extract(member=member,
                                              path=target_path,
                                              pwd=self.package_password)

                                else:
                                    z.extract(member=member, path=target_path)

                            except KeyboardInterrupt:
                                # Delete latest file, since most likely it was not extracted fully
                                os.remove(
                                    os.path.join(target_path, member.filename))

                                # Quit
                                sys.exit()

                        file_count += 1

        elif self.format == FileFormat.TAR:
            tar = tarfile.open(self.filename, "r:gz")
            progress = tqdm(tar,
                            desc="{0: <25s}".format('Extract'),
                            file=sys.stdout,
                            leave=False,
                            disable=self.disable_progress_bar,
                            ascii=self.use_ascii_progress_bar)

            for i, tar_info in enumerate(progress):
                if self.disable_progress_bar:
                    # NOTE(review): len(progress) needs the wrapped iterable
                    # to support len(); TarFile may not define __len__ --
                    # confirm this branch is exercised in practice.
                    self.logger.info(
                        '  {title:<15s} [{item_id:d}/{total:d}] {file:<30s}'.
                        format(title='Extract ',
                               item_id=i,
                               total=len(progress),
                               file=tar_info.name))

                if not os.path.isfile(os.path.join(
                        target_path, tar_info.name)) or overwrite:
                    tar.extract(tar_info, target_path)

                # Drop the member cache so memory stays flat while streaming
                # through a large archive.
                tar.members = []
            tar.close()

        return self
Code example #4
0
File: dcase2016.py — Project: stachu86/dcase_util
    def prepare(self):
        """Prepare dataset for the usage.

        Creates the meta data file and the DCASE2016 cross-validation setup
        files (train/test/evaluate per fold) when they do not exist yet.

        Returns
        -------
        self

        """

        if not self.meta_container.exists():
            scene_label = 'home'

            dcase_cross_val_data = ListDictContainer(filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv')).load(
                    fields=['id', 'filename', 'set_id'])

            # Map each audio file to its annotation file (deduplicated).
            audio_files = {}
            for item in dcase_cross_val_data:
                audio_filename = os.path.join(
                    'chime_home', 'chunks',
                    item['filename'] + self.sample_mode + '.wav')
                annotation_filename = os.path.join('chime_home', 'chunks',
                                                   item['filename'] + '.csv')

                if audio_filename not in audio_files:
                    audio_files[audio_filename] = {
                        'audio': audio_filename,
                        'meta': annotation_filename
                    }

            meta_data = MetaDataContainer()
            for audio_filename, data in iteritems(audio_files):
                current_meta_data = DictContainer(filename=os.path.join(
                    self.local_path, data['meta'])).load()

                # Collect tag labels, skipping 'S' and 'U' codes.
                # NOTE(review): presumably silence/unknown per the CHiME-Home
                # label codes -- confirm against the dataset documentation.
                tags = []
                for i, tag in enumerate(current_meta_data['majorityvote']):
                    if tag != 'S' and tag != 'U':
                        tags.append(self.tagcode_to_taglabel(tag))

                name = os.path.split(audio_filename)[1]
                # Segment identifier is the filename part before '_chunk'.
                # (Unused local 'chunk_name' removed.)
                segment_name = name[0:name.find('_chunk')]

                item = MetaDataItem({
                    'filename': audio_filename,
                    'scene_label': scene_label,
                    'tags': ';'.join(tags) + ';',
                    'identifier': segment_name
                })

                self.process_meta_item(item=item, absolute_path=False)

                meta_data.append(item)

            # Save meta
            meta_data.save(filename=self.meta_file)

            # Load meta and cross validation
            self.load()

        # Check whether all setup files already exist for every fold.
        all_folds_found = True
        for fold in range(1, self.crossvalidation_folds + 1):
            train_filename = self.evaluation_setup_filename(setup_part='train',
                                                            fold=fold)

            test_filename = self.evaluation_setup_filename(setup_part='test',
                                                           fold=fold)

            eval_filename = self.evaluation_setup_filename(
                setup_part='evaluate', fold=fold)

            if not os.path.isfile(train_filename):
                all_folds_found = False

            if not os.path.isfile(test_filename):
                all_folds_found = False

            if not os.path.isfile(eval_filename):
                all_folds_found = False

        if not all_folds_found:
            Path().makedirs(path=self.evaluation_setup_path)

            dcase_crossval = {
                1: [],
                2: [],
                3: [],
                4: [],
                5: [],
            }
            dcase_cross_val_data = ListDictContainer(filename=os.path.join(
                self.local_path, 'chime_home',
                'development_chunks_refined_crossval_dcase2016.csv')).load(
                    fields=['id', 'filename', 'set_id'])

            # Bucket files by fold; set_id in the CSV is 0-based.
            for item in dcase_cross_val_data:
                dcase_crossval[int(item['set_id']) + 1].append(
                    self.relative_to_absolute_path(
                        os.path.join(
                            'chime_home', 'chunks',
                            item['filename'] + self.sample_mode + '.wav')))

            for fold in range(1, self.crossvalidation_folds + 1):
                # Collect training and testing files
                train_files = []
                for f in range(1, self.crossvalidation_folds + 1):
                    # BUGFIX: was 'f is not fold' -- identity comparison on
                    # ints only works by accident (CPython small-int caching);
                    # use value inequality.
                    if f != fold:
                        train_files += dcase_crossval[f]
                test_files = dcase_crossval[fold]

                # Create meta containers and save them

                # Train
                train_filename = self.evaluation_setup_filename(
                    setup_part='train', fold=fold)

                train_meta = MetaDataContainer(filename=train_filename)
                for filename in train_files:
                    item = self.file_meta(filename)[0]
                    self.process_meta_item(item=item, absolute_path=False)

                    train_meta.append(item)

                train_meta.save()

                # Test: filenames only, no labels.
                test_filename = self.evaluation_setup_filename(
                    setup_part='test', fold=fold)

                test_meta = MetaDataContainer(filename=test_filename)
                for filename in test_files:
                    item = MetaDataItem(
                        {'filename': self.absolute_to_relative_path(filename)})
                    test_meta.append(item)

                test_meta.save()

                # Evaluate: test files with full meta for scoring.
                eval_filename = self.evaluation_setup_filename(
                    setup_part='evaluate', fold=fold)

                eval_meta = MetaDataContainer(filename=eval_filename)
                for filename in test_files:
                    item = self.file_meta(filename)[0]
                    self.process_meta_item(item=item, absolute_path=False)

                    eval_meta.append(item)

                eval_meta.save()

            # Load meta and cross validation
            self.load()

        return self
Code example #5
0
File: dcase2016.py — Project: stachu86/dcase_util
    def prepare(self):
        """Prepare dataset for the usage.

        Creates the meta data file from the evaluation chunk list and the
        single (fold-less) evaluation setup files when they do not exist yet.

        Returns
        -------
        self

        """
        if not self.meta_container.exists():
            scene_label = 'home'

            evaluation_chunks = ListDictContainer(
                filename=os.path.join(self.local_path, 'chime_home',
                                      'evaluation_chunks_refined.csv')).load(
                                          fields=['id', 'filename', 'set_id'])

            # Map each audio file to its annotation file (deduplicated).
            audio_files = {}
            # BUGFIX: loop previously iterated over 'dcase_cross_val_data',
            # which is undefined in this method (NameError); the container
            # loaded above is 'evaluation_chunks'.
            for item in evaluation_chunks:
                audio_filename = os.path.join(
                    'chime_home', 'chunks',
                    item['filename'] + self.sample_mode + '.wav')
                annotation_filename = os.path.join('chime_home', 'chunks',
                                                   item['filename'] + '.csv')

                if audio_filename not in audio_files:
                    audio_files[audio_filename] = {
                        'audio': audio_filename,
                        'meta': annotation_filename
                    }

            meta_data = MetaDataContainer()
            for audio_filename, data in iteritems(audio_files):
                current_meta_data = DictContainer(filename=os.path.join(
                    self.local_path, data['meta'])).load()

                # Collect tag labels, skipping 'S' and 'U' codes.
                # NOTE(review): presumably silence/unknown per the CHiME-Home
                # label codes -- confirm against the dataset documentation.
                tags = []
                for i, tag in enumerate(current_meta_data['majorityvote']):
                    if tag != 'S' and tag != 'U':
                        tags.append(self.tagcode_to_taglabel(tag))

                name = os.path.split(audio_filename)[1]
                # Segment identifier is the filename part before '_chunk'.
                # (Unused local 'chunk_name' removed.)
                segment_name = name[0:name.find('_chunk')]

                item = MetaDataItem({
                    'filename': audio_filename,
                    'scene_label': scene_label,
                    'tags': ';'.join(tags) + ';',
                    'identifier': segment_name
                })

                self.process_meta_item(item=item, absolute_path=False)

                meta_data.append(item)

            # Save meta
            meta_data.save(filename=self.meta_file)

            # Load meta and cross validation
            self.load()

        # Check whether all setup files already exist (no folds here).
        all_folds_found = True

        train_filename = self.evaluation_setup_filename(setup_part='train')

        test_filename = self.evaluation_setup_filename(setup_part='test')

        eval_filename = self.evaluation_setup_filename(setup_part='evaluate')

        if not os.path.isfile(train_filename):
            all_folds_found = False

        if not os.path.isfile(test_filename):
            all_folds_found = False

        if not os.path.isfile(eval_filename):
            all_folds_found = False

        if not all_folds_found:
            Path().makedirs(path=self.evaluation_setup_path)

            # Train
            train_filename = self.evaluation_setup_filename(setup_part='train')

            train_meta = MetaDataContainer(filename=train_filename)
            for filename in self.train_files():
                train_meta.append(self.file_meta(filename)[0])

            train_meta.save()

            # Test: filenames only, no labels.
            test_filename = self.evaluation_setup_filename(setup_part='test')

            test_meta = MetaDataContainer(filename=test_filename)
            for filename in self.test_files():
                test_meta.append(
                    MetaDataItem(
                        {'filename':
                         self.absolute_to_relative_path(filename)}))

            test_meta.save()

            # Evaluate: test files with full meta for scoring.
            eval_filename = self.evaluation_setup_filename(
                setup_part='evaluate')

            eval_meta = MetaDataContainer(filename=eval_filename)
            for filename in self.test_files():
                eval_meta.append(self.file_meta(filename)[0])

            eval_meta.save()

            # Load meta and cross validation
            self.load()

        return self
Code example #6
0
    def prepare(self):
        """Prepare dataset for the usage.

        Creates the meta data file (scene label derived from the audio
        filename) and generates stratified cross-validation setup files
        when they do not exist yet.

        Returns
        -------
        self

        """

        if not self.meta_container.exists():
            meta_data = MetaDataContainer()

            for filename in self.audio_files:
                raw_path, raw_filename = os.path.split(filename)
                relative_path = self.absolute_to_relative_path(raw_path)

                # Scene label = filename stem with the last two characters
                # dropped.
                # NOTE(review): presumably the trailing two characters are a
                # recording-id suffix -- confirm with the dataset naming
                # scheme.
                meta_data.append(
                    MetaDataItem({
                        'filename':
                        os.path.join(relative_path, raw_filename),
                        'scene_label':
                        os.path.splitext(os.path.split(filename)[1])[0][:-2],
                    }))

            meta_data.save(filename=self.meta_file)

            self.load_meta()

        # Check whether the setup files already exist for every fold.
        all_folds_found = True
        for fold in self.folds():
            train_filename = self.evaluation_setup_filename(setup_part='train',
                                                            fold=fold)

            test_filename = self.evaluation_setup_filename(setup_part='test',
                                                           fold=fold)

            if not os.path.isfile(train_filename):
                all_folds_found = False

            if not os.path.isfile(test_filename):
                all_folds_found = False

        if not all_folds_found:
            Path().makedirs(path=self.evaluation_setup_path)

            # Collect labels and filenames for stratified splitting.
            classes = []
            files = []
            for item in self.meta:
                classes.append(item.scene_label)
                files.append(item.filename)

            files = numpy.array(files)

            # Stratified 70/30 train/test split per fold; fixed random_state
            # keeps the folds reproducible across runs.
            from sklearn.model_selection import StratifiedShuffleSplit
            sss = StratifiedShuffleSplit(n_splits=self.crossvalidation_folds,
                                         test_size=0.3,
                                         random_state=0)

            fold = 1
            for train_index, test_index in sss.split(X=numpy.zeros(
                    len(classes)),
                                                     y=classes):
                train_files = files[train_index]
                test_files = files[test_index]
                train_filename = self.evaluation_setup_filename(
                    setup_part='train', fold=fold)

                test_filename = self.evaluation_setup_filename(
                    setup_part='test', fold=fold)

                eval_filename = self.evaluation_setup_filename(
                    setup_part='evaluate', fold=fold)

                # Create meta containers and save them

                # Train: full meta rows for the training files.
                train_meta = MetaDataContainer(filename=train_filename)

                for filename in train_files:
                    train_meta += self.meta_container.filter(filename=filename)

                train_meta.save()

                # Test: filenames only, no labels.
                test_meta = MetaDataContainer(filename=test_filename)

                for filename in test_files:
                    test_meta.append(
                        MetaDataItem({
                            'filename':
                            self.absolute_to_relative_path(filename)
                        }))

                test_meta.save()

                # Evaluate: test files with full meta for scoring.
                eval_meta = MetaDataContainer(filename=eval_filename)

                for filename in test_files:
                    eval_meta += self.meta_container.filter(filename=filename)

                eval_meta.save()

                fold += 1

        # Load meta and cross validation
        self.load()

        return self
Code example #7
0
    def extract_packages(self):
        """Extract the dataset packages.

        Downloads the audio referenced by the meta packages (when audio
        content is included), logs non-accessible items into
        ``item_access_error.log.csv``, and generates the two-fold evaluation
        setup files plus the main meta file when they do not exist yet.

        Raises
        ------
        IOError
            Local package was not found.

        Returns
        -------
        self

        """

        def _collect_items(source_filename, audio_path, excluded, fields=None):
            """Load a metadata CSV, point item filenames into audio_path with
            the default audio extension, skip excluded (non-accessible) items,
            and keep only items whose audio file exists when audio content is
            included."""

            collected = MetaDataContainer()

            load_kwargs = {'csv_header': True}
            if fields is not None:
                load_kwargs['fields'] = fields

            audio_included = ('audio' in self.included_content_types
                              or 'all' in self.included_content_types)

            for item in MetaDataContainer().load(
                    os.path.join(self.local_path, source_filename),
                    **load_kwargs):
                if item.filename in excluded:
                    continue

                # Normalize the filename: force the default audio extension
                # and place the file under audio_path.
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        audio_path,
                        os.path.splitext(item.filename)[0] + '.' +
                        self.default_audio_extension)
                else:
                    item.filename = Path(path=item.filename).modify(
                        path_base=audio_path)

                # Only collect items which exist if audio content is present
                if audio_included:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        collected.append(item)
                else:
                    collected.append(item)

            return collected

        # Make sure evaluation_setup directory exists
        Path().makedirs(
            path=os.path.join(self.local_path, self.evaluation_setup_folder))

        log = FancyLogger()

        item_access_log_filename = os.path.join(self.local_path,
                                                'item_access_error.log.csv')

        if 'audio' in self.included_content_types or self.included_content_types == [
                'all'
        ]:  # mean process audio
            log.title("Download_data")
            log.info(
                "Once database is downloaded, do not forget to check your missing_files"
            )

            non_existing_videos = pandas.DataFrame(
                columns=["filename", "error"])

            log.line("check files exist or download data")
            # Collect file ids
            for package in self.package_list:
                if package.get('content_type') == "meta":
                    base_filepath = os.path.splitext(
                        package.get('filename').split('/')[-1])[0]
                    if 'train' in package.get('filename'):
                        result_audio_directory = os.path.join(
                            self.local_path, 'dataset/audio/train',
                            base_filepath)
                    else:
                        result_audio_directory = os.path.join(
                            self.local_path, 'dataset/audio/test')

                    missing_files = download(package.get('filename'),
                                             result_audio_directory,
                                             n_jobs=3)
                    if not missing_files.empty:
                        # DataFrame.append was removed in pandas 2.0;
                        # pandas.concat is the supported replacement.
                        non_existing_videos = pandas.concat(
                            [non_existing_videos, missing_files],
                            ignore_index=True)

            # Save list of non-accessible videos
            ListDictContainer(non_existing_videos.to_dict(orient="records"),
                              filename=item_access_log_filename).save(
                                  fields=['filename', 'error'])

        # Evaluation setup filenames
        train_filename_fold1 = self.evaluation_setup_filename(
            setup_part='train', fold=1, file_extension='csv')

        test_filename_fold1 = self.evaluation_setup_filename(
            setup_part='test', fold=1, file_extension='csv')

        train_filename_fold2 = self.evaluation_setup_filename(
            setup_part='train', fold=2, file_extension='csv')

        test_filename_fold2 = self.evaluation_setup_filename(
            setup_part='test', fold=2, file_extension='csv')

        evaluate_filename = self.evaluation_setup_filename(
            setup_part='evaluate', fold=2, file_extension='csv')

        # Check that the full evaluation setup exists
        evaluation_setup_exists = (
            all(os.path.isfile(filename)
                for filename in (train_filename_fold1, test_filename_fold1,
                                 train_filename_fold2, test_filename_fold2,
                                 evaluate_filename))
            and self.meta_container.exists())

        if not evaluation_setup_exists:
            # Evaluation setup was not found, generate one
            non_existing_videos = ListDictContainer().load(
                filename=item_access_log_filename,
                delimiter=',').get_field_unique('filename')

            # Fold 1 train: weakly labelled training data
            train_meta_weak_fold1 = _collect_items(
                source_filename='dataset/metadata/train/weak.csv',
                audio_path='dataset/audio/train/weak',
                excluded=non_existing_videos,
                fields=["filename", "tags"])

            train_meta_weak_fold1.save(filename=train_filename_fold1,
                                       csv_header=True,
                                       file_format="CSV")

            # Fold 1 test: unlabelled in-domain data
            test_meta_unlabel_fold1 = _collect_items(
                source_filename='dataset/metadata/train/unlabel_in_domain.csv',
                audio_path='dataset/audio/train/unlabel_in_domain',
                excluded=non_existing_videos)

            test_meta_unlabel_fold1.save(filename=test_filename_fold1,
                                         csv_header=True,
                                         file_format="CSV")

            # Fold 2 train is all the data used in fold 1
            train_meta_weak_fold2 = MetaDataContainer()
            train_meta_weak_fold2 += MetaDataContainer().load(
                train_filename_fold1, csv_header=True, file_format="CSV")

            for item in MetaDataContainer().load(test_filename_fold1,
                                                 csv_header=True,
                                                 file_format="CSV"):
                # Unlabelled data goes into fold 2 training without tags.
                item.tags = []
                train_meta_weak_fold2.append(item)

            train_meta_weak_fold2.save(filename=train_filename_fold2,
                                       csv_header=True)

            # Evaluate meta is the groundtruth file with test annotations test.csv
            evaluate_meta = _collect_items(
                source_filename='dataset/metadata/test/test.csv',
                audio_path='dataset/audio/test',
                excluded=non_existing_videos)

            evaluate_meta.save(filename=evaluate_filename,
                               csv_header=True,
                               file_format="CSV")

            # Test meta is filenames of evaluation, labels will be predicted
            test_meta_strong_fold2 = MetaDataContainer()
            for filename in evaluate_meta.unique_files:
                test_meta_strong_fold2.append(
                    MetaDataItem({'filename': filename}))

            test_meta_strong_fold2.save(filename=test_filename_fold2,
                                        csv_header=True,
                                        file_format="CSV")

            # meta_data is the default meta container containing all files of the dataset
            meta_data = MetaDataContainer()
            for setup_filename in (train_filename_fold1, test_filename_fold1,
                                   test_filename_fold2):
                meta_data += MetaDataContainer().load(setup_filename,
                                                      csv_header=True,
                                                      file_format="CSV")

            # Save meta
            meta_data.save(filename=self.meta_file)

        log.foot()

        return self
Code example #8
0
File: dcase2017.py Project: yuliangzhang/dcase_util
    def prepare(self):
        """Prepare dataset for the usage.

        Builds the meta container from the strong-label reference annotations
        when available, and generates the test/evaluate setup files for the
        'youtube' scene when they are missing.

        Returns
        -------
        self

        """

        # Evaluation setup directory must exist before any setup file is saved
        Path().makedirs(
            path=os.path.join(self.local_path, self.evaluation_setup_folder))

        reference_data_file = os.path.join(
            self.local_path, 'groundtruth_strong_label_evaluation_set.csv')

        have_reference = os.path.exists(reference_data_file)

        if not self.meta_container.exists() and have_reference:
            # Reference data is present but meta data is empty; build it.
            meta_data = MetaDataContainer()

            for item in MetaDataContainer().load(
                    filename=reference_data_file):
                # Audio files carry a 'Y' prefix and the default extension
                item.filename = ('Y' + os.path.splitext(item.filename)[0] +
                                 '.' + self.default_audio_extension)

                # Every item belongs to the 'youtube' scene
                item.scene_label = 'youtube'

                # Keep only items whose audio file is actually present
                if os.path.isfile(os.path.join(self.local_path,
                                               item.filename)):
                    meta_data.append(item)

            # Save meta data
            meta_data.save(filename=self.meta_container.filename)

            # Load meta and cross validation
            self.load()

        test_filename = self.evaluation_setup_filename(setup_part='test',
                                                       scene_label='youtube',
                                                       file_extension='txt')

        evaluate_filename = self.evaluation_setup_filename(
            setup_part='evaluate', scene_label='youtube', file_extension='txt')

        # The setup exists only when both files are present
        if not (os.path.isfile(test_filename)
                and os.path.isfile(evaluate_filename)):
            # Evaluation setup was not found; generate one.
            if have_reference:
                evaluate_meta = MetaDataContainer()

                for item in MetaDataContainer().load(
                        filename=reference_data_file):
                    # Normalize the audio file path under 'audio/'
                    if not item.filename.endswith(
                            self.default_audio_extension):
                        item.filename = os.path.join(
                            'audio', 'Y' + os.path.splitext(item.filename)[0] +
                            '.' + self.default_audio_extension)

                    # Set scene label
                    item.scene_label = 'youtube'

                    self.process_meta_item(item=item, absolute_path=False)

                    evaluate_meta.append(item)

                evaluate_meta.save(filename=self.evaluation_setup_filename(
                    setup_part='evaluate',
                    scene_label='youtube',
                    file_extension='txt'))

            # Test list covers every audio file found locally
            test_meta = MetaDataContainer()
            for audio_file in Path().file_list(
                    path=self.local_path, extensions=self.audio_extensions):
                item = MetaDataItem({
                    'filename': os.path.split(audio_file)[1],
                    'scene_label': 'youtube'
                })
                self.process_meta_item(item=item, absolute_path=False)

                test_meta.append(item)

            test_meta.save(filename=self.evaluation_setup_filename(
                setup_part='test', scene_label='youtube',
                file_extension='txt'))

            # Load meta and cross validation
            self.load()

        return self
Code example #9
0
File: dcase2017.py Project: yuliangzhang/dcase_util
    def prepare(self):
        """Prepare dataset for the usage.

        Downloads the audio segments from YouTube when audio content is
        included (logging non-accessible videos), then generates the fold-1
        evaluation setup files and the main meta file when missing.

        Returns
        -------
        self

        """

        if is_jupyter():
            from tqdm import tqdm_notebook as tqdm
        else:
            from tqdm import tqdm

        # Make sure audio directory exists
        Path().makedirs(path=os.path.join(self.local_path, 'audio'))

        # Make sure evaluation_setup directory exists
        Path().makedirs(
            path=os.path.join(self.local_path, self.evaluation_setup_folder))

        if 'audio' in self.included_content_types:
            # Collect file ids
            files = []
            files += ListDictContainer(filename=os.path.join(
                self.local_path, 'testing_set.csv')).load(
                    fields=['query_id', 'segment_start', 'segment_end'])

            files += ListDictContainer(filename=os.path.join(
                self.local_path, 'training_set.csv')).load(
                    fields=['query_id', 'segment_start', 'segment_end'])

            file_progress = tqdm(files,
                                 desc="{0: <25s}".format('Files'),
                                 file=sys.stdout,
                                 leave=False,
                                 disable=self.disable_progress_bar,
                                 ascii=self.use_ascii_progress_bar)

            non_existing_videos = {}

            # Load list of already identified non-accessible videos
            item_access_log_filename = os.path.join(
                self.local_path, 'item_access_error.log.csv')
            if os.path.isfile(item_access_log_filename):
                for item in ListDictContainer(
                        filename=item_access_log_filename).load(
                            fields=['query_id', 'error']):
                    non_existing_videos[item['query_id']] = item

            # Check that audio files exists
            for file_data in file_progress:
                audio_filename = os.path.join(
                    self.local_path, 'audio',
                    'Y{query_id}_{segment_start}_{segment_end}.{extension}'.
                    format(query_id=file_data['query_id'],
                           segment_start=file_data['segment_start'],
                           segment_end=file_data['segment_end'],
                           extension=self.default_audio_extension))

                # Download segment if it does not exists
                if not os.path.isfile(audio_filename) and file_data[
                        'query_id'] not in non_existing_videos:
                    try:
                        AudioContainer().load_from_youtube(
                            query_id=file_data['query_id'],
                            start=file_data['segment_start'],
                            stop=file_data['segment_end']).save(
                                filename=audio_filename)

                    except IOError as e:
                        # BUG FIX: exceptions have no `.message` attribute on
                        # Python 3; accessing it raised AttributeError and
                        # masked the original error. str(e) is portable.
                        non_existing_videos[file_data['query_id']] = {
                            'error': str(e).replace('\n', ' '),
                            'query_id': file_data['query_id']
                        }

            # Save list of non-accessible videos
            ListDictContainer(list(non_existing_videos.values()),
                              filename=item_access_log_filename).save(
                                  fields=['query_id', 'error'])

        # Evaluation setup filenames
        train_filename = self.evaluation_setup_filename(setup_part='train',
                                                        fold=1,
                                                        scene_label='youtube',
                                                        file_extension='txt')

        test_filename = self.evaluation_setup_filename(setup_part='test',
                                                       fold=1,
                                                       scene_label='youtube',
                                                       file_extension='txt')

        evaluate_filename = self.evaluation_setup_filename(
            setup_part='evaluate',
            fold=1,
            scene_label='youtube',
            file_extension='txt')

        # Check that evaluation setup exists
        evaluation_setup_exists = True
        if not os.path.isfile(train_filename) or not os.path.isfile(
                test_filename) or not os.path.isfile(evaluate_filename):
            evaluation_setup_exists = False

        if not evaluation_setup_exists:
            # Evaluation setup was not found, generate one
            fold = 1

            train_meta = MetaDataContainer()
            for item in MetaDataContainer().load(
                    os.path.join(self.local_path,
                                 'groundtruth_weak_label_training_set.csv')):
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        'audio', 'Y' + os.path.splitext(item.filename)[0] +
                        '.' + self.default_audio_extension)

                # Set scene label
                item.scene_label = 'youtube'

                # Translate event onset and offset, weak labels.
                # Order matters: offset is shifted before onset is zeroed.
                item.offset -= item.onset
                item.onset -= item.onset

                # Only collect items which exists if audio present
                if 'audio' in self.included_content_types:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        train_meta.append(item)
                else:
                    train_meta.append(item)

            train_meta.save(
                filename=self.evaluation_setup_filename(setup_part='train',
                                                        fold=fold,
                                                        scene_label='youtube',
                                                        file_extension='txt'))

            evaluate_meta = MetaDataContainer()
            for item in MetaDataContainer().load(
                    os.path.join(self.local_path,
                                 'groundtruth_strong_label_testing_set.csv')):
                if not item.filename.endswith(self.default_audio_extension):
                    item.filename = os.path.join(
                        'audio', 'Y' + os.path.splitext(item.filename)[0] +
                        '.' + self.default_audio_extension)
                # Set scene label
                item.scene_label = 'youtube'

                # Only collect items which exists
                if 'audio' in self.included_content_types:
                    if os.path.isfile(
                            os.path.join(self.local_path, item.filename)):
                        evaluate_meta.append(item)
                else:
                    evaluate_meta.append(item)

            evaluate_meta.save(
                filename=self.evaluation_setup_filename(setup_part='evaluate',
                                                        fold=fold,
                                                        scene_label='youtube',
                                                        file_extension='txt'))

            test_meta = MetaDataContainer()
            for item in evaluate_meta:
                test_meta.append(MetaDataItem({'filename': item.filename}))

            test_meta.save(
                filename=self.evaluation_setup_filename(setup_part='test',
                                                        fold=fold,
                                                        scene_label='youtube',
                                                        file_extension='txt'))

            # Load meta and cross validation
            self.load()

        if not self.meta_container.exists():
            fold = 1
            meta_data = MetaDataContainer()
            meta_data += MetaDataContainer().load(
                self.evaluation_setup_filename(setup_part='train',
                                               fold=fold,
                                               scene_label='youtube',
                                               file_extension='txt'))

            meta_data += MetaDataContainer().load(
                self.evaluation_setup_filename(setup_part='evaluate',
                                               fold=fold,
                                               scene_label='youtube',
                                               file_extension='txt'))
            # Save meta
            meta_data.save(filename=self.meta_file)

            # Load meta and cross validation
            self.load()

        return self
Code example #10
0
File: datasets.py Project: stachu86/dcase_util
    def pack(self,
             dataset_name='dcase-dataset',
             content=None,
             output_path=None,
             base_path=None,
             overwrite=False,
             verbose=True):
        """Pack dataset.

        Packages are recreated only when a source file is newer than the
        existing package, or when `overwrite` is set. Markdown documents are
        optionally converted to HTML before packing.

        Parameters
        ----------
        dataset_name : str
            Dataset name
            Default value 'dcase-dataset'

        content : list of dict
            List of packages to be packed. Package item dict should have format
            {'data_name': 'doc', 'file_list': [{'source': 'file1.txt'}]}.
            Default value None

        output_path : str
            Path to which packages are saved.
            Default value None

        base_path : str
            Base path of the data. If per item package paths are not given
            ('target' field), this parameter is used to create one from the
            source path.
            Default value None

        overwrite : bool
            Overwrite existing packages.
            Default value False

        verbose : bool
            Show information during the packing.
            Default value True

        Raises
        ------
        IOError
            Source file listed in the content was not found.

        Returns
        -------
        nothing

        """

        if verbose:
            log = FancyLogger()
            log.section_header('Packing dataset [{dataset_name}]'.format(
                dataset_name=dataset_name))

        if content is None:
            # BUG FIX: the default content=None previously crashed at
            # iteration; treat it as "nothing to pack".
            content = []

        if base_path is not None and not base_path.endswith(os.path.sep):
            base_path += os.path.sep

        for group in content:
            if verbose:
                log.line('[{data_name}]'.format(data_name=group['data_name']))

            package_filename = os.path.join(
                output_path,
                self.filename_template.format(
                    dataset_name=dataset_name,
                    data_name=group['data_name'],
                    extension=self.package_extension))

            # Validate sources, fill in missing targets, and track the newest
            # source modification time.
            newest_source = 0
            for item in group['file_list']:
                if not os.path.exists(item['source']):
                    message = '{name}: File not found [{source_file}].'.format(
                        name=self.__class__.__name__,
                        source_file=item['source'])

                    self.logger.exception(message)
                    raise IOError(message)

                if 'target' not in item:
                    # BUG FIX: guard against base_path=None, which previously
                    # raised TypeError in startswith().
                    if base_path is not None and item['source'].startswith(
                            base_path):
                        item['target'] = item['source'][len(base_path):]
                    else:
                        item['target'] = item['source']

                timestamp = os.path.getmtime(item['source'])
                if newest_source < timestamp:
                    newest_source = timestamp

            # Get newest package, take care of split packages
            all_packages = Path().file_list(
                path=os.path.split(os.path.abspath(package_filename))[0],
                extensions=os.path.splitext(package_filename)[1][1:])

            newest_package = 0
            for package in all_packages:
                base_name = os.path.splitext(os.path.split(package)[-1])[0]

                # Split packages carry a numeric suffix; strip it so the name
                # matches the base package. Guard against an empty name.
                if base_name and base_name[-1].isdigit():
                    base_name = os.path.splitext(base_name)[0]

                if base_name == os.path.splitext(
                        os.path.split(package_filename)[-1])[0]:
                    timestamp = os.path.getmtime(package)
                    if newest_package < timestamp:
                        newest_package = timestamp

            # Repack only when a source is newer than the existing package
            # or when overwriting is requested.
            if newest_package < newest_source or overwrite:
                if self.convert_md_to_html:
                    # Convert markdown content to HTML and include the
                    # results in the package.
                    new_files = []
                    for item in group['file_list']:
                        if os.path.splitext(item['source'])[-1] == '.md':
                            html_filename = os.path.splitext(
                                item['source'])[0] + '.html'

                            # Convert when the HTML is missing or older than
                            # the markdown source.
                            needs_conversion = (
                                overwrite
                                or not os.path.exists(html_filename)
                                or os.path.getmtime(item['source']) >
                                os.path.getmtime(html_filename))

                            if needs_conversion:
                                self.convert_markdown(
                                    source_filename=item['source'],
                                    target_filename=html_filename)

                                new_files.append({
                                    'source':
                                    html_filename,
                                    'target':
                                    os.path.splitext(item['target'])[0] +
                                    '.html'
                                })

                    # Add new html files to the file_list
                    group['file_list'] += new_files

                # Create packages
                package = Package(filename=package_filename)
                package_filenames = package.compress(
                    file_list=group['file_list'],
                    size_limit=self.package_size_limit)

                if verbose:
                    log.line('Saved', indent=2)

                    for i in package_filenames:
                        # BUG FIX: base_path may be None; only strip it from
                        # the displayed path when it was given.
                        displayed = i.replace(base_path,
                                              '') if base_path else i
                        log.line('[{file}] [{size}]'.format(
                            file=displayed,
                            size=get_byte_string(os.path.getsize(i),
                                                 show_bytes=False)),
                                 indent=4)

        if verbose:
            log.foot()