Example no. 1
0
def ensure_downloads(url=DATA_URL,target_dir=DEFAULT_CODEGOLF_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked

    url -- source URL for the dataset archive (default: DATA_URL)
    target_dir -- directory into which the archive is downloaded and unpacked

    Returns True on success; raises RuntimeError if the download fails or
    unpacking does not produce the expected file.
    """
    file_ops.mkdir_p( target_dir )
    # presence of this file is treated as proof the archive was unpacked
    expected = os.path.join( target_dir, 'train','yes0.wav')
    if not os.path.exists( expected ):
        archive = os.path.join( target_dir, TAR_FILE )
        # (re)download when the archive is missing or its size differs from
        # the expected size (a mismatch implies a truncated/partial download)
        if not os.path.exists( archive ) or os.stat( archive ).st_size != FILE_SIZE:
            log.info("Downloading codegolf dataset to %s", target_dir )
            # BUG FIX: use the `url` parameter rather than the DATA_URL
            # module constant, so callers can actually override the source
            if not file_ops.download_file(
                url,
                archive,
            ):
                raise RuntimeError( "Unable to download %s to %s"%(
                    url,
                    archive,
                ))
        if sys.version_info.major == 3:
            log.info("Using Python 3.x lzma support to unpack")
            file_ops.untar(archive, target_dir, mode='r:xz')
        else:
            # Python 2 has no stdlib lzma support; shell out to tar for .xz
            log.warning("Attempting decompresion/unpacking via tar command" )
            # BUG FIX: extract into target_dir (cwd=), not the process CWD,
            # otherwise the `expected` check below can never succeed
            subprocess.check_call( ['tar', '-xJf', archive], cwd=target_dir)
        if not os.path.exists( expected ):
            raise RuntimeError("Untarring the source file did not create %s"%(expected,))
    log.info("CodeGolf Yes/No dataset is installed in %s"%(target_dir,))
    return True
Example no. 2
0
def ensure_downloads(files,base_url=BASE_URL,target_dir=DEFAULT_LIBRISPEECH_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked

    files -- iterable of librispeech archive filenames (e.g. 'dev-clean.tar.gz')
    base_url -- URL prefix from which each archive is downloaded
    target_dir -- directory that holds the archives and unpacked directories

    Raises RuntimeError if a download fails or unpacking does not produce
    the expected directory.
    """
    log.info("Downloading librispeech to %s", target_dir )
    file_ops.mkdir_p( target_dir )
    # BUG FIX: `long` was removed in Python 3, so the original
    # isinstance(filesize,(long,int)) check raised NameError there.
    # Build the integer-type tuple compatibly for both major versions.
    try:
        integer_types = (int, long)  # Python 2  # noqa: F821
    except NameError:
        integer_types = (int,)  # Python 3
    for filename in files:
        final_filename = os.path.join( target_dir, filename )
        log.info("Ensuring download: %s", final_filename)
        filesize = FILE_SIZES.get( filename, 'Unknown Size')
        size_desc = file_ops.human_bytes(filesize) if isinstance(filesize, integer_types) else filesize
        if filename in DIRECTORY_NAMES:
            without_extension = os.path.join( target_dir, DIRECTORY_NAMES[filename])
        else:
            # strip the trailing '.tar.gz' (7 characters)
            without_extension = final_filename[:-7]

        if not os.path.exists( without_extension ):
            # (re)download when the archive is missing or its size differs
            # from the expected size (a mismatch implies truncation)
            if (not os.path.exists( final_filename )) or os.stat(final_filename).st_size != filesize:
                final_url = base_url + filename
                log.info("Need to download %s (%s)", final_url,size_desc )
                if not file_ops.download_file(
                    final_url,
                    final_filename,
                ):
                    raise RuntimeError("Unable to download %s to %s"%(
                        final_url,final_filename,
                    ))
            # unpack into a temp directory first so a partial unpack never
            # masquerades as a finished dataset directory
            working = tempfile.mkdtemp(dir=target_dir,prefix="unpack-",suffix="-tmp")
            try:
                file_ops.untar(final_filename, working)
                text_files = []
                for name in glob.glob(os.path.join(working,'LibriSpeech','*')):
                    if os.path.basename( name ) == os.path.basename(without_extension):
                        # this is the dataset directory itself; move into place
                        os.rename( name, without_extension )
                    elif os.path.splitext(name)[1].upper() == '.TXT':
                        text_files.append( name )
                    else:
                        log.warning("Unexpected directory in %s: %r",final_filename, name)
                # move the readme/license text files into the dataset directory
                for text_file in text_files:
                    os.rename( text_file, os.path.join( without_extension, os.path.basename(text_file)))
                if not os.path.exists( without_extension ):
                    raise RuntimeError(
                        "Unable to find the directory %s expected from %s"%(
                            without_extension,
                            final_filename,
                        )
                    )
            finally:
                # always clean up the scratch directory, even on failure
                shutil.rmtree( working )
Example no. 3
0
    def install(self):
        '''
        Method to both download and extract the dataset from the internet (if there) or verify connection settings

        Returns the (dataset_location, file_type) pair: the absolute path of
        the dataset on disk and its file type as reported by get_file_type.
        Raises RuntimeError when the file cannot be found locally and no
        download source was provided.
        '''
        file_type = None
        if self.filename is not None:
            log.info('Installing dataset %s', str(self.filename))
            # construct the actual path to the dataset; dataset_dir is
            # resolved relative to this module's own location, so chdir
            # there temporarily and restore the old cwd afterwards
            prevdir = os.getcwd()
            os.chdir(os.path.split(os.path.realpath(__file__))[0])
            dataset_dir = os.path.realpath(self.dataset_dir)
            try:
                mkdir_p(dataset_dir)
                dataset_location = os.path.join(dataset_dir, self.filename)
            except Exception as e:
                log.error("Couldn't make the dataset path with directory %s and filename %s",
                          dataset_dir,
                          str(self.filename))
                log.exception("%s", str(e))
                dataset_location = None
            finally:
                os.chdir(prevdir)

            # check if the dataset is already in the source, otherwise download it.
            # first check if the base filename exists - without all the extensions.
            # then, add each extension on and keep checking until the upper level, when you download from http.
            if dataset_location is not None:
                (dirs, fname) = os.path.split(dataset_location)
                split_fname = fname.split('.')
                accumulated_name = split_fname[0]
                found = False
                # first check if the filename was a directory (like for the midi datasets)
                if os.path.exists(os.path.join(dirs, accumulated_name)):
                    found = True
                    file_type = get_file_type(os.path.join(dirs, accumulated_name))
                    dataset_location = os.path.join(dirs, accumulated_name)
                    log.debug('Found file %s', dataset_location)
                # now go through the file extensions starting with the lowest level and check if the file exists
                if not found and len(split_fname) > 1:
                    for chunk in split_fname[1:]:
                        accumulated_name = '.'.join((accumulated_name, chunk))
                        file_type = get_file_type(os.path.join(dirs, accumulated_name))
                        if file_type is not None:
                            dataset_location = os.path.join(dirs, accumulated_name)
                            log.debug('Found file %s', dataset_location)
                            break

            # if the file wasn't found, download it if a source was provided. Otherwise, raise error.
            download_success = True
            if file_type is None:
                # BUG FIX: the original raised whenever self.source was None,
                # even when the dataset had already been found locally.
                # Only fail when nothing was found AND there is no source.
                if self.source is not None:
                    download_success = download_file(self.source, dataset_location)
                    file_type = get_file_type(dataset_location)
                else:
                    log.error("Filename %s couldn't be found, and no URL source to download was provided.",
                              str(self.filename))
                    raise RuntimeError("Filename %s couldn't be found, and no URL source to download was provided." %
                                       str(self.filename))

            # if the file type is a zip, unzip it.
            unzip_success = True
            if file_type is files.ZIP:
                (dirs, fname) = os.path.split(dataset_location)
                # drop the final extension to name the unpack directory
                post_unzip = os.path.join(dirs, '.'.join(fname.split('.')[0:-1]))
                unzip_success = files.unzip(dataset_location, post_unzip)
                # if the unzip was successful
                if unzip_success:
                    # remove the zipfile and update the dataset location and file type
                    log.debug('Removing file %s', dataset_location)
                    os.remove(dataset_location)
                    dataset_location = post_unzip
                    file_type = get_file_type(dataset_location)
            if download_success and unzip_success:
                log.info('Installation complete. Yay!')
            else:
                log.warning('Something went wrong installing dataset. Boo :(')

            return dataset_location, file_type
Example no. 4
0
    def install(self):
        '''
        Method to both download and extract the dataset from the internet (if applicable) or verify that the file
        exists in the dataset_dir.

        Returns
        -------
        str
            The absolute path to the dataset location on disk.
        int
            The integer representing the file type for the dataset, as defined in the opendeep.utils.file_ops module.

        Raises
        ------
        RuntimeError
            When the file cannot be found locally and no download source was provided.
        '''
        file_type = None
        if self.filename is not None:
            log.info('Installing dataset %s', str(self.filename))
            # construct the actual path to the dataset; dataset_dir is
            # resolved relative to this module's own location, so chdir
            # there temporarily and restore the old cwd afterwards
            prevdir = os.getcwd()
            os.chdir(os.path.split(os.path.realpath(__file__))[0])
            dataset_dir = os.path.realpath(self.dataset_dir)
            try:
                mkdir_p(dataset_dir)
                dataset_location = os.path.join(dataset_dir, self.filename)
            except Exception as e:
                log.error("Couldn't make the dataset path with directory %s and filename %s",
                          dataset_dir,
                          str(self.filename))
                log.exception("%s", str(e))
                dataset_location = None
            finally:
                os.chdir(prevdir)

            # check if the dataset is already in the source, otherwise download it.
            # first check if the base filename exists - without all the extensions.
            # then, add each extension on and keep checking until the upper level, when you download from http.
            if dataset_location is not None:
                (dirs, fname) = os.path.split(dataset_location)
                split_fname = fname.split('.')
                accumulated_name = split_fname[0]
                found = False
                # first check if the filename was a directory (like for the midi datasets)
                if os.path.exists(os.path.join(dirs, accumulated_name)):
                    found = True
                    file_type = get_file_type(os.path.join(dirs, accumulated_name))
                    dataset_location = os.path.join(dirs, accumulated_name)
                    log.debug('Found file %s', dataset_location)
                # now go through the file extensions starting with the lowest level and check if the file exists
                if not found and len(split_fname) > 1:
                    for chunk in split_fname[1:]:
                        accumulated_name = '.'.join((accumulated_name, chunk))
                        file_type = get_file_type(os.path.join(dirs, accumulated_name))
                        if file_type is not None:
                            dataset_location = os.path.join(dirs, accumulated_name)
                            log.debug('Found file %s', dataset_location)
                            break

            # if the file wasn't found, download it if a source was provided. Otherwise, raise error.
            download_success = True
            if file_type is None:
                # BUG FIX: the original raised whenever self.source was None,
                # even when the dataset had already been found locally.
                # Only fail when nothing was found AND there is no source.
                if self.source is not None:
                    download_success = download_file(self.source, dataset_location)
                    file_type = get_file_type(dataset_location)
                else:
                    log.error("Filename %s couldn't be found, and no URL source to download was provided.",
                              str(self.filename))
                    raise RuntimeError("Filename %s couldn't be found, and no URL source to download was provided." %
                                       str(self.filename))

            # if the file type is a zip, unzip it.
            unzip_success = True
            if file_type is files.ZIP:
                (dirs, fname) = os.path.split(dataset_location)
                # drop the final extension to name the unpack directory
                post_unzip = os.path.join(dirs, '.'.join(fname.split('.')[0:-1]))
                unzip_success = files.unzip(dataset_location, post_unzip)
                # if the unzip was successful
                if unzip_success:
                    # remove the zipfile and update the dataset location and file type
                    log.debug('Removing file %s', dataset_location)
                    os.remove(dataset_location)
                    dataset_location = post_unzip
                    file_type = get_file_type(dataset_location)
            if download_success and unzip_success:
                log.info('Installation complete. Yay!')
            else:
                log.warning('Something went wrong installing dataset. Boo :(')

            return dataset_location, file_type