def ensure_downloads(url=DATA_URL, target_dir=DEFAULT_CODEGOLF_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked"""
    file_ops.mkdir_p(target_dir)
    expected = os.path.join(target_dir, 'train', 'yes0.wav')
    if not os.path.exists(expected):
        archive = os.path.join(target_dir, TAR_FILE)
        if not os.path.exists(archive) or os.stat(archive).st_size != FILE_SIZE:
            log.info("Downloading codegolf dataset to %s", target_dir)
            # Use the url parameter (not the module-level DATA_URL) so that
            # callers can actually override the download location.
            if not file_ops.download_file(url, archive):
                raise RuntimeError("Unable to download %s to %s" % (url, archive))
        if sys.version_info.major == 3:
            log.info("Using Python 3.x lzma support to unpack")
            file_ops.untar(archive, target_dir, mode='r:xz')
        else:
            log.warning("Attempting decompression/unpacking via tar command")
            # -C extracts into target_dir instead of the current directory,
            # so the `expected` check below looks in the right place.
            subprocess.check_call(['tar', '-xJf', archive, '-C', target_dir])
        if not os.path.exists(expected):
            raise RuntimeError("Untarring the source file did not create %s" % (expected,))
    log.info("CodeGolf Yes/No dataset is installed in %s", target_dir)
    return True
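# Hedged usage sketch for ensure_downloads above: DATA_URL, TAR_FILE,
# FILE_SIZE and DEFAULT_CODEGOLF_DATASET_PATH are module-level constants
# assumed to be defined alongside the function, and the target path below
# is purely illustrative.
def _demo_codegolf_download(target='/tmp/codegolf-demo'):
    import logging
    logging.basicConfig(level=logging.INFO)
    # Idempotent: the train/yes0.wav sentinel short-circuits both the
    # download and the unpack once the dataset is already in place.
    return ensure_downloads(target_dir=target)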
def ensure_downloads(files, base_url=BASE_URL, target_dir=DEFAULT_LIBRISPEECH_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked"""
    log.info("Downloading librispeech to %s", target_dir)
    file_ops.mkdir_p(target_dir)
    for filename in files:
        final_filename = os.path.join(target_dir, filename)
        log.info("Ensuring download: %s", final_filename)
        filesize = FILE_SIZES.get(filename, 'Unknown Size')
        # `long` existed only in Python 2; a plain int check covers Python 3.
        size_desc = file_ops.human_bytes(filesize) if isinstance(filesize, int) else filesize
        if filename in DIRECTORY_NAMES:
            without_extension = os.path.join(target_dir, DIRECTORY_NAMES[filename])
        else:
            # Strip the trailing '.tar.gz' (7 characters) to get the unpack directory.
            without_extension = final_filename[:-7]
        if not os.path.exists(without_extension):
            if not os.path.exists(final_filename) or os.stat(final_filename).st_size != filesize:
                final_url = base_url + filename
                log.info("Need to download %s (%s)", final_url, size_desc)
                if not file_ops.download_file(final_url, final_filename):
                    raise RuntimeError("Unable to download %s to %s" % (final_url, final_filename))
            working = tempfile.mkdtemp(dir=target_dir, prefix="unpack-", suffix="-tmp")
            try:
                file_ops.untar(final_filename, working)
                text_files = []
                for name in glob.glob(os.path.join(working, 'LibriSpeech', '*')):
                    if os.path.basename(name) == os.path.basename(without_extension):
                        os.rename(name, without_extension)
                    elif os.path.splitext(name)[1].upper() == '.TXT':
                        text_files.append(name)
                    else:
                        log.warning("Unexpected directory in %s: %r", final_filename, name)
                for text_file in text_files:
                    os.rename(text_file, os.path.join(without_extension, os.path.basename(text_file)))
                if not os.path.exists(without_extension):
                    raise RuntimeError(
                        "Unable to find the directory %s expected from %s"
                        % (without_extension, final_filename))
            finally:
                shutil.rmtree(working)
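# The function above avoids half-unpacked datasets by extracting into a
# temporary sibling directory and only then renaming the finished tree into
# place. A minimal self-contained sketch of that pattern using just the
# standard library (the archive path and 'LibriSpeech' member name here are
# illustrative, not part of any confirmed API):
import os
import shutil
import tarfile
import tempfile

def unpack_atomically(archive, target_dir, member='LibriSpeech'):
    """Extract `archive` so that target_dir/member appears only when complete."""
    final = os.path.join(target_dir, member)
    if os.path.exists(final):
        return final  # already unpacked; nothing to do
    # The temp dir lives inside target_dir so the final rename stays on one
    # filesystem and is effectively atomic on POSIX systems.
    working = tempfile.mkdtemp(dir=target_dir, prefix='unpack-', suffix='-tmp')
    try:
        with tarfile.open(archive) as tf:
            tf.extractall(working)
        os.rename(os.path.join(working, member), final)  # the commit point
    finally:
        shutil.rmtree(working, ignore_errors=True)
    return final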
def install(self):
    '''
    Method to both download and extract the dataset from the internet (if applicable)
    or verify that the file exists in the dataset_dir.

    Returns
    -------
    str
        The absolute path to the dataset location on disk.
    int
        The integer representing the file type for the dataset, as defined in the
        opendeep.utils.file_ops module.
    '''
    file_type = None
    # Initialize so the return statement is safe even when self.filename is None.
    dataset_location = None
    if self.filename is not None:
        log.info('Installing dataset %s', str(self.filename))
        # construct the actual path to the dataset
        prevdir = os.getcwd()
        os.chdir(os.path.split(os.path.realpath(__file__))[0])
        dataset_dir = os.path.realpath(self.dataset_dir)
        try:
            mkdir_p(dataset_dir)
            dataset_location = os.path.join(dataset_dir, self.filename)
        except Exception as e:
            log.error("Couldn't make the dataset path with directory %s and filename %s",
                      dataset_dir, str(self.filename))
            log.exception("%s", str(e))
            dataset_location = None
        finally:
            os.chdir(prevdir)

        # check if the dataset is already in the source, otherwise download it.
        # first check if the base filename exists - without all the extensions.
        # then, add each extension on and keep checking until the upper level,
        # when you download from http.
        if dataset_location is not None:
            (dirs, fname) = os.path.split(dataset_location)
            split_fname = fname.split('.')
            accumulated_name = split_fname[0]
            found = False
            # first check if the filename was a directory (like for the midi datasets)
            if os.path.exists(os.path.join(dirs, accumulated_name)):
                found = True
                file_type = get_file_type(os.path.join(dirs, accumulated_name))
                dataset_location = os.path.join(dirs, accumulated_name)
                log.debug('Found file %s', dataset_location)
            # now go through the file extensions starting with the lowest level
            # and check if the file exists
            if not found and len(split_fname) > 1:
                for chunk in split_fname[1:]:
                    accumulated_name = '.'.join((accumulated_name, chunk))
                    file_type = get_file_type(os.path.join(dirs, accumulated_name))
                    if file_type is not None:
                        dataset_location = os.path.join(dirs, accumulated_name)
                        log.debug('Found file %s', dataset_location)
                        break

        # if the file wasn't found, download it if a source was provided. Otherwise, raise error.
        download_success = True
        if self.source is not None:
            if file_type is None:
                download_success = download_file(self.source, dataset_location)
                file_type = get_file_type(dataset_location)
        elif file_type is None:
            # Only an error when the file is actually missing; a dataset found
            # locally needs no download source.
            log.error("Filename %s couldn't be found, and no URL source to download was provided.",
                      str(self.filename))
            raise RuntimeError("Filename %s couldn't be found, and no URL source to download was provided."
                               % str(self.filename))

        # if the file type is a zip, unzip it.
        unzip_success = True
        if file_type == files.ZIP:  # equality, not identity, for the constant
            (dirs, fname) = os.path.split(dataset_location)
            post_unzip = os.path.join(dirs, '.'.join(fname.split('.')[0:-1]))
            unzip_success = files.unzip(dataset_location, post_unzip)
            # if the unzip was successful
            if unzip_success:
                # remove the zipfile and update the dataset location and file type
                log.debug('Removing file %s', dataset_location)
                os.remove(dataset_location)
                dataset_location = post_unzip
                file_type = get_file_type(dataset_location)

        if download_success and unzip_success:
            log.info('Installation complete. Yay!')
        else:
            log.warning('Something went wrong installing dataset. Boo :(')

    return dataset_location, file_type
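# Hedged usage sketch for install() above: the method only reads three
# attributes from self (filename, dataset_dir, source), so any host class
# providing them will do. _DemoDataset and its attribute values below are
# hypothetical, not names from the surrounding library.
class _DemoDataset(object):
    install = install  # reuse the function above as a method

    def __init__(self):
        self.filename = 'demo.zip'                    # illustrative name
        self.dataset_dir = '../../datasets'           # illustrative path
        self.source = 'http://example.com/demo.zip'   # placeholder URL

# dataset_location, file_type = _DemoDataset().install()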