def ensure_downloads(url=DATA_URL, target_dir=DEFAULT_CODEGOLF_DATASET_PATH):
    """Ensure the CodeGolf Yes/No dataset is downloaded and unpacked.

    url -- source URL for the dataset archive
    target_dir -- directory into which the archive is downloaded and unpacked

    Returns True on success; raises RuntimeError if the download fails or
    unpacking does not produce the expected file.
    """
    file_ops.mkdir_p(target_dir)
    # Presence of this one file is used as the "already installed" marker
    expected = os.path.join(target_dir, 'train', 'yes0.wav')
    if not os.path.exists(expected):
        archive = os.path.join(target_dir, TAR_FILE)
        # Re-download when the archive is missing or its size is wrong
        # (partial/corrupt download)
        if not os.path.exists(archive) or os.stat(archive).st_size != FILE_SIZE:
            log.info("Downloading codegolf dataset to %s", target_dir)
            # BUG FIX: honour the ``url`` parameter instead of always using
            # the module-level DATA_URL default
            if not file_ops.download_file(url, archive):
                raise RuntimeError("Unable to download %s to %s" % (url, archive))
        if sys.version_info.major == 3:
            log.info("Using Python 3.x lzma support to unpack")
            file_ops.untar(archive, target_dir, mode='r:xz')
        else:
            # Python 2 has no stdlib lzma; shell out to tar instead
            log.warning("Attempting decompression/unpacking via tar command")
            # BUG FIX: extract into target_dir (-C), not the current working
            # directory, otherwise ``expected`` would never appear there
            subprocess.check_call(['tar', '-xJf', archive, '-C', target_dir])
        if not os.path.exists(expected):
            raise RuntimeError(
                "Untarring the source file did not create %s" % (expected,)
            )
    log.info("CodeGolf Yes/No dataset is installed in %s", target_dir)
    return True
def ensure_downloads(files, base_url=BASE_URL, target_dir=DEFAULT_LIBRISPEECH_DATASET_PATH):
    """Ensure the given LibriSpeech archives are downloaded and unpacked.

    files -- iterable of archive filenames (relative to base_url) to fetch
    base_url -- URL prefix the filenames are appended to
    target_dir -- directory into which archives are downloaded and unpacked

    Raises RuntimeError if a download fails or the expected directory does
    not exist after unpacking.
    """
    import numbers  # local import: integral check that works on Python 2 and 3
    log.info("Downloading librispeech to %s", target_dir)
    file_ops.mkdir_p(target_dir)
    for filename in files:
        final_filename = os.path.join(target_dir, filename)
        log.info("Ensuring download: %s", final_filename)
        filesize = FILE_SIZES.get(filename, 'Unknown Size')
        # BUG FIX: ``long`` does not exist on Python 3 (this file explicitly
        # supports py3); numbers.Integral covers py2 int/long and py3 int
        size_desc = (
            file_ops.human_bytes(filesize)
            if isinstance(filesize, numbers.Integral)
            else filesize
        )
        if filename in DIRECTORY_NAMES:
            without_extension = os.path.join(target_dir, DIRECTORY_NAMES[filename])
        else:
            # strip the ".tar.gz" suffix (7 characters)
            without_extension = final_filename[:-7]
        if not os.path.exists(without_extension):
            # Download only if the archive is missing or its size is wrong
            if (not os.path.exists(final_filename)) or os.stat(final_filename).st_size != filesize:
                final_url = base_url + filename
                log.info("Need to download %s (%s)", final_url, size_desc)
                if not file_ops.download_file(final_url, final_filename):
                    raise RuntimeError(
                        "Unable to download %s to %s" % (final_url, final_filename)
                    )
            # Unpack into a temporary directory so a partially-unpacked
            # archive never masquerades as a complete install
            working = tempfile.mkdtemp(dir=target_dir, prefix="unpack-", suffix="-tmp")
            try:
                file_ops.untar(final_filename, working)
                text_files = []
                for name in glob.glob(os.path.join(working, 'LibriSpeech', '*')):
                    if os.path.basename(name) == os.path.basename(without_extension):
                        os.rename(name, without_extension)
                    elif os.path.splitext(name)[1].upper() == '.TXT':
                        text_files.append(name)
                    else:
                        log.warning("Unexpected directory in %s: %r", final_filename, name)
                # Move the metadata text files into the unpacked directory
                for text_file in text_files:
                    os.rename(
                        text_file,
                        os.path.join(without_extension, os.path.basename(text_file)),
                    )
                if not os.path.exists(without_extension):
                    raise RuntimeError(
                        "Unable to find the directory %s expected from %s" % (
                            without_extension, final_filename,
                        )
                    )
            finally:
                shutil.rmtree(working)
def __init__( self, path=DEFAULT_TEDLIUM_DATASET_PATH, window_duration = 0.01, skip_count = 1, max_speeches = None, ): """Initialize the Dataset with a given storage for TEDLIUM path -- target path for the TED LIUM data storage window_duration -- duration of the audio window in seconds skip_count -- step size across the segments in the repo used to do a very small subset of the dataset when doing testing iterations. This allows you to test an "epoch" across a small subset of the 40GB data-file """ self.window_size = 2**int(math.ceil(math.log(int(window_duration * 16000),2))) source_filename = path + '.tar.gz' if not os.path.exists(path): if os.path.exists(source_filename): # Note: this could, in theory overwrite anything on disk, as the Python # tarfile module doesn't prevent writing outside the root directory # (according to its docs). file_ops.untar(source_filename, destination_dir=os.path.dirname(path)) if not os.path.exists(path): raise RuntimeError( "You need to download the TEDLIUM corpus (v2) from %(url)s and save it to %(path)s"%{ 'url': LIUM_BASE + TEDLIUM_DOWNLOAD_URL, 'path': source_filename, } ) path = os.path.realpath(path) log.info("Searching for speeches") self.train_speeches = [ tedlium.Speech( sph, window_size=self.window_size ) for sph in file_ops.find_files( path, '.*[/]train[/]sph[/].*[.]sph', ) ] if max_speeches: self.train_speeches = self.train_speeches[:max_speeches] self.test_speeches = [ tedlium.Speech( sph, window_size=self.window_size ) for sph in file_ops.find_files( path, '.*[/]test[/]sph[/].*[.]sph', ) ] if max_speeches: self.test_speeches = self.test_speeches[:max_speeches] self.valid_speeches = [ tedlium.Speech( sph, window_size=self.window_size ) for sph in file_ops.find_files( path, '.*[/]dev[/]sph[/].*[.]sph', ) ] if max_speeches: self.valid_speeches = self.valid_speeches[:max_speeches] log.info( "Creating speech segments (utterance records using 1/%s of the utterances)", skip_count, ) train_inputs,train_targets = 
inputs_and_targets( self.train_speeches ) valid_inputs,valid_targets = inputs_and_targets( self.valid_speeches ) test_inputs,test_targets = inputs_and_targets( self.test_speeches ) log.info("Initializing the OpenDeep dataset") super(TEDLIUMDataset,self).__init__( train_inputs=train_inputs,train_targets=train_targets, valid_inputs=valid_inputs,valid_targets=valid_targets, test_inputs=test_inputs,test_targets=test_targets, )