def __init__(self, filename):
    """
    Initialize a FileService and create train, valid, and test filenames from the given base filename.
    If a file exists with the same name, delete it.

    The base path is resolved with ``os.path.realpath`` and its parent directory is handed to
    ``mkdir_p``. The three filenames are formed by inserting ``_train``, ``_valid`` and ``_test``
    between the base name and its extension.

    Parameters
    ----------
    filename : str
        Base filepath to use for the train, valid, and test files.

    Raises
    ------
    AssertionError
        If `filename` is not a string.
    """
    assert isinstance(filename, string_types), \
        "input filename needs to be a string, found %s" % str(type(filename))
    self.value_separator = os.linesep
    filename = os.path.realpath(filename)
    basedir = os.path.dirname(filename)
    mkdir_p(basedir)
    # create the appropriate train, valid, test versions of the file
    name, ext = os.path.splitext(os.path.basename(filename))
    self.train_filename = os.path.join(basedir, name + '_train' + ext)
    self.valid_filename = os.path.join(basedir, name + '_valid' + ext)
    self.test_filename = os.path.join(basedir, name + '_test' + ext)
    # delete the files if they already exist. Every path is checked against itself; the previous
    # code tested `self.value_separator` for the valid file, so a stale valid file was never removed.
    for stale in (self.train_filename, self.valid_filename, self.test_filename):
        if os.path.exists(stale):
            os.remove(stale)
def ensure_downloads(url=DATA_URL, target_dir=DEFAULT_CODEGOLF_DATASET_PATH):
    """Ensure that the codegolf archive has been downloaded and/or unpacked.

    The dataset counts as installed when ``<target_dir>/train/yes0.wav`` exists. Otherwise the
    archive ``TAR_FILE`` is downloaded into `target_dir` (unless a copy of exactly ``FILE_SIZE``
    bytes is already there) and unpacked.

    Parameters
    ----------
    url : str
        Location to download the archive from. Defaults to ``DATA_URL``.
    target_dir : str
        Directory that receives the archive and the unpacked files.

    Returns
    -------
    bool
        Always True; failures are reported by raising.

    Raises
    ------
    RuntimeError
        If the download fails or unpacking does not produce the expected file.
    subprocess.CalledProcessError
        If the external ``tar`` command (Python 2 fallback) exits with an error.
    """
    file_ops.mkdir_p(target_dir)
    expected = os.path.join(target_dir, 'train', 'yes0.wav')
    if not os.path.exists(expected):
        archive = os.path.join(target_dir, TAR_FILE)
        if not os.path.exists(archive) or os.stat(archive).st_size != FILE_SIZE:
            log.info("Downloading codegolf dataset to %s", target_dir)
            # honour the caller-supplied url; the global DATA_URL is only the default
            if not file_ops.download_file(url, archive):
                raise RuntimeError("Unable to download %s to %s" % (url, archive))
        if sys.version_info.major >= 3:
            log.info("Using Python 3.x lzma support to unpack")
            file_ops.untar(archive, target_dir, mode='r:xz')
        else:
            log.warning("Attempting decompresion/unpacking via tar command")
            # -C makes tar extract into target_dir instead of the current working directory,
            # which is where `expected` is looked for afterwards
            subprocess.check_call(['tar', '-xJf', archive, '-C', target_dir])
        if not os.path.exists(expected):
            raise RuntimeError("Untarring the source file did not create %s" % (expected,))
    log.info("CodeGolf Yes/No dataset is installed in %s" % (target_dir,))
    return True
def __init__(self, filename):
    """
    Initialize a FileService and create empty train, valid, and test files from the given base filename.

    The base path is resolved with ``os.path.realpath`` and its parent directory is handed to
    ``mkdir_p``. The three filenames are formed by inserting ``_train``, ``_valid`` and ``_test``
    between the base name and its extension. Existing files with those names are truncated.

    Parameters
    ----------
    filename : str
        Base filepath to use for the train, valid, and test files.

    Raises
    ------
    AssertionError
        If `filename` is not a string.
    """
    assert isinstance(filename, string_types), \
        "input filename needs to be a string, found %s" % str(type(filename))
    self.value_separator = os.linesep
    filename = os.path.realpath(filename)
    basedir = os.path.dirname(filename)
    mkdir_p(basedir)
    # create the appropriate train, valid, test versions of the file
    name, ext = os.path.splitext(os.path.basename(filename))
    self.train_filename = os.path.join(basedir, name + '_train' + ext)
    self.valid_filename = os.path.join(basedir, name + '_valid' + ext)
    self.test_filename = os.path.join(basedir, name + '_test' + ext)
    # init the files to be empty. Opening in 'wb' mode already creates/truncates the file;
    # writing the str '' to a binary handle raises TypeError on Python 3, so nothing is written.
    for path in (self.train_filename, self.valid_filename, self.test_filename):
        with open(path, 'wb'):
            pass
def __init__(self, filename):
    """
    Initialize a FileService and create train, valid, and test filenames from the given base filename.
    If a file exists with the same name, delete it.

    Parameters
    ----------
    filename : str
        Base filepath to use for the train, valid, and test files. It is resolved with
        ``os.path.realpath``; ``_train``, ``_valid`` and ``_test`` are inserted before the extension.

    Raises
    ------
    AssertionError
        If `filename` is not a string.
    """
    assert isinstance(filename, string_types), \
        "input filename needs to be a string, found %s" % str(type(filename))
    self.value_separator = os.linesep
    filename = os.path.realpath(filename)
    basedir = os.path.dirname(filename)
    mkdir_p(basedir)
    # create the appropriate train, valid, test versions of the file
    name = os.path.basename(filename)
    name, ext = os.path.splitext(name)
    self.train_filename = os.path.join(basedir, name + '_train' + ext)
    self.valid_filename = os.path.join(basedir, name + '_valid' + ext)
    self.test_filename = os.path.join(basedir, name + '_test' + ext)
    # delete the files if they already exist
    if os.path.exists(self.train_filename):
        os.remove(self.train_filename)
    # the existence check must look at the valid file itself; it used to test
    # `self.value_separator`, which left any old valid file in place
    if os.path.exists(self.valid_filename):
        os.remove(self.valid_filename)
    if os.path.exists(self.test_filename):
        os.remove(self.test_filename)
def __init__(self, filename):
    """
    Initialize a FileService and create empty train, valid, and test files from the given base filename.

    Parameters
    ----------
    filename : str
        Base filepath to use for the train, valid, and test files. It is resolved with
        ``os.path.realpath``; ``_train``, ``_valid`` and ``_test`` are inserted before the extension.

    Raises
    ------
    AssertionError
        If `filename` is not a string.
    """
    assert isinstance(filename, string_types), \
        "input filename needs to be a string, found %s" % str(type(filename))
    self.value_separator = os.linesep
    filename = os.path.realpath(filename)
    basedir = os.path.dirname(filename)
    mkdir_p(basedir)
    # create the appropriate train, valid, test versions of the file
    name = os.path.basename(filename)
    name, ext = os.path.splitext(name)
    self.train_filename = os.path.join(basedir, name + '_train' + ext)
    self.valid_filename = os.path.join(basedir, name + '_valid' + ext)
    self.test_filename = os.path.join(basedir, name + '_test' + ext)
    # init the files to be empty: 'wb' creates or truncates on open. The handle is binary, so the
    # payload must be bytes -- writing the str '' fails with TypeError on Python 3.
    with open(self.train_filename, 'wb') as f:
        f.write(b'')
    with open(self.valid_filename, 'wb') as f:
        f.write(b'')
    with open(self.test_filename, 'wb') as f:
        f.write(b'')
def ensure_downloads(files, base_url=BASE_URL, target_dir=DEFAULT_LIBRISPEECH_DATASET_PATH):
    """Ensure that all of the given files have been downloaded and/or unpacked.

    For every archive name in `files` the unpacked directory is looked up (via ``DIRECTORY_NAMES``
    or by stripping the last 7 characters of the archive path). When that directory is missing the
    archive is downloaded if absent or of unexpected size, unpacked into a temporary directory
    inside `target_dir`, and the matching ``LibriSpeech/<name>`` directory plus any ``.txt``
    files are moved into place. The temporary directory is always removed.

    Parameters
    ----------
    files : iterable of str
        Archive file names, appended to `base_url` to form the download URL.
    base_url : str
        URL prefix for the archives.
    target_dir : str
        Directory that receives the archives and the unpacked data.

    Raises
    ------
    RuntimeError
        If a download fails or the expected directory is not produced by unpacking.
    """
    # local import: the original block did not have this module in scope
    import numbers

    log.info("Downloading librispeech to %s", target_dir)
    file_ops.mkdir_p(target_dir)
    for filename in files:
        final_filename = os.path.join(target_dir, filename)
        log.info("Ensuring download: %s", final_filename)
        filesize = FILE_SIZES.get(filename, 'Unknown Size')
        # numbers.Integral covers int on Python 3 and int/long on Python 2; the bare name
        # `long` does not exist on Python 3
        if isinstance(filesize, numbers.Integral):
            size_desc = file_ops.human_bytes(filesize)
        else:
            size_desc = filesize
        if filename in DIRECTORY_NAMES:
            without_extension = os.path.join(target_dir, DIRECTORY_NAMES[filename])
        else:
            # NOTE(review): assumes a 7-character suffix such as '.tar.gz' -- confirm
            without_extension = final_filename[:-7]
        if os.path.exists(without_extension):
            continue
        if (not os.path.exists(final_filename)) or os.stat(final_filename).st_size != filesize:
            final_url = base_url + filename
            log.info("Need to download %s (%s)", final_url, size_desc)
            if not file_ops.download_file(final_url, final_filename):
                raise RuntimeError("Unable to download %s to %s" % (final_url, final_filename))
        working = tempfile.mkdtemp(dir=target_dir, prefix="unpack-", suffix="-tmp")
        try:
            file_ops.untar(final_filename, working)
            text_files = []
            for name in glob.glob(os.path.join(working, 'LibriSpeech', '*')):
                if os.path.basename(name) == os.path.basename(without_extension):
                    os.rename(name, without_extension)
                elif os.path.splitext(name)[1].upper() == '.TXT':
                    text_files.append(name)
                else:
                    log.warning("Unexpected directory in %s: %r", final_filename, name)
            # verify the directory before moving text files into it, so a bad archive is
            # reported with this message rather than an OSError from os.rename
            if not os.path.exists(without_extension):
                raise RuntimeError(
                    "Unable to find the directory %s expected from %s" % (
                        without_extension, final_filename,
                    )
                )
            for text_file in text_files:
                os.rename(text_file, os.path.join(without_extension, os.path.basename(text_file)))
        finally:
            shutil.rmtree(working)
def setUp(self):
    """Create the fixture directory tree used by the tests.

    Sets ``self.base`` and creates ``train/``, ``valid/`` and ``test/`` below it, each holding
    two ``.txt`` files and one ``.md`` file with two lines of text followed by a blank line.
    """
    self.base = "test_filestream_dir/"
    splits = ("train", "valid", "test")
    folders = {split: self.base + split + "/" for split in splits}
    # all directories are created before any file is written
    for split in splits:
        mkdir_p(folders[split])
    # (split, file name, file content), in creation order
    fixtures = (
        ("train", "train1.txt", "TRAIN1a\ntrain1b\n\n"),
        ("train", "train2.txt", "TRAIN2a\ntrain2b\n\n"),
        ("train", "train3.md", "TRAIN3a\ntrain3b\n\n"),
        ("valid", "valid1.txt", "valid1a\nvalid1b\n\n"),
        ("valid", "valid2.txt", "valid2a\nvalid2b\n\n"),
        ("valid", "valid3.md", "valid3a\nvalid3b\n\n"),
        ("test", "test1.txt", "test1a\ntest1b\n\n"),
        ("test", "test2.txt", "test2a\ntest2b\n\n"),
        ("test", "test3.md", "test3a\ntest3b\n\n"),
    )
    for split, fname, text in fixtures:
        with open(folders[split] + fname, "w") as handle:
            handle.write(text)
def setUp(self):
    """Write the train/valid/test fixture files under ``self.base``.

    Each of the three sub-directories receives two ``.txt`` files and one ``.md`` file.
    """
    self.base = "test_filestream_dir/"
    # sub-directory -> ordered (file name, content) pairs
    layout = [
        ("train/", [("train1.txt", "TRAIN1a\ntrain1b\n\n"),
                    ("train2.txt", "TRAIN2a\ntrain2b\n\n"),
                    ("train3.md", "TRAIN3a\ntrain3b\n\n")]),
        ("valid/", [("valid1.txt", "valid1a\nvalid1b\n\n"),
                    ("valid2.txt", "valid2a\nvalid2b\n\n"),
                    ("valid3.md", "valid3a\nvalid3b\n\n")]),
        ("test/", [("test1.txt", "test1a\ntest1b\n\n"),
                   ("test2.txt", "test2a\ntest2b\n\n"),
                   ("test3.md", "test3a\ntest3b\n\n")]),
    ]
    directories = [self.base + sub for sub, _ in layout]
    # create test directories first ...
    for directory in directories:
        mkdir_p(directory)
    # ... then the files inside them
    for directory, (_, entries) in zip(directories, layout):
        for fname, content in entries:
            with open(directory + fname, "w") as out:
                out.write(content)
def setUp(self):
    """Prepare the logger and the two fixture directories for the file dataset tests.

    ``target_in_file`` holds tab-separated "features<TAB>label" rows; ``target_in_filename``
    holds one free-text file per label (``cat.txt``, ``dog.txt``).
    """
    # get a logger for this session
    self.log = logging.getLogger(__name__)
    self.dir = "filedataset_test_files"

    def _write(folder, fname, text):
        # small local helper: write `text` to <folder>/<fname>
        with open(os.path.join(folder, fname), 'w') as handle:
            handle.write(text)

    # fixtures where the target is stored inside the file
    self.single_file_dir = os.path.join(self.dir, "target_in_file")
    mkdir_p(self.single_file_dir)
    self.data0 = [[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]]
    self.data1 = [[6, 7, 8, 9, 0], [7, 8, 9, 0, 1]]
    _write(self.single_file_dir, "1.txt", "1,2,3,4,5\t0\n6,7,8,9,0\t1")
    # NOTE(review): second row ends in 2 while self.data1 ends in 1 -- kept as in the original
    _write(self.single_file_dir, "2.txt", "2,3,4,5,6\t0\n7,8,9,0,2\t1")

    # fixtures where the target is encoded in the file name
    self.data_files = os.path.join(self.dir, "target_in_filename")
    mkdir_p(self.data_files)
    self.cat = "I am a feline! \nMeow."
    self.dog = "I am a canine! \nWoof."
    _write(self.data_files, "cat.txt", self.cat)
    _write(self.data_files, "dog.txt", self.dog)
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None,
             input_size=None, output_size=None, outdir=None, **kwargs):
    """
    Initialize a new Model.

    Model implementations should accept optional `inputs_hook` and `hiddens_hook` (if applicable)
    so inputs and hidden representations can be supplied from elsewhere, which lets models be
    linked together. A `params_hook` allows sharing parameters instead of creating new ones.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information that replaces the default model inputs (e.g. feeding a Softmax model
        from a DAE's hidden layer). When its shape element is not None it overrides `input_size`.
    hiddens_hook : Tuple of (shape, variable)
        Routing information that replaces the default hidden representation; the computation
        graph should then run outputs directly from the supplied hidden variable.
    params_hook : List(theano shared variable)
        Parameters to use instead of initializing new shared variables, e.g. to share weights
        between a training model with dropout and a testing model without.
    input_size : int or shape tuple
        Dimensionality of the input. If it is falsy after considering `inputs_hook`, a warning
        is logged and 1 is used (the common case being models added to a Container).
    output_size : int or shape tuple
        Dimensionality of the output, needed for stacking models. If falsy, a warning is logged
        and the input size is used (the generative-model case).
    outdir : str
        Directory for outputs (parameters, images, etc.). A trailing directory separator is
        appended when missing. If None, nothing will be saved.
    kwargs : dict
        All other keyword parameters; they are stored in `self.args` together with
        input_size, output_size and outdir.
    """
    log.info("Creating a new instance of %s", str(type(self)))

    # the hooks are stored untouched
    self.inputs_hook = inputs_hook
    self.hiddens_hook = hiddens_hook
    self.params_hook = params_hook

    # outdir always ends in a directory separator when one was given
    if outdir and outdir[-1] != os.sep:
        outdir = outdir + os.sep
    self.outdir = outdir

    # the shape carried by inputs_hook wins over an explicit input_size
    if inputs_hook and inputs_hook[0] is not None:
        input_size = inputs_hook[0]
    if not input_size:
        # could be a programmer error, but is normal during automatic stacking in a Container
        log.warning("No input_size or inputs_hook! Make sure this is done in a Container. "
                    "Setting input_size=1 for the Container now...")
        input_size = 1
    self.input_size = input_size

    if not output_size:
        # generative models have no separate output size: mirror the input
        log.warning("No output_size given! Make sure this is from a generative model "
                    "(where output_size is the same as input_size. "
                    "Setting output_size=input_size now...")
        output_size = input_size
    self.output_size = output_size

    # gather everything into the args (configuration) dictionary
    self.args = add_kwargs_to_dict(kwargs.copy(), {})
    for key in ('input_size', 'output_size', 'outdir'):
        self.args[key] = getattr(self, key)

    # create the output directory, if any
    if self.args['outdir']:
        mkdir_p(self.args['outdir'])

    log.info("%s self.args: %s", str(type(self)), str(self.args))
    self.save_args()
def __init__(self, inputs=None, hiddens=None, outputs=None, params=None, outdir=None, **kwargs):
    """
    Initialize a new Model.

    Model implementations should accept optional `inputs` and `hiddens` so inputs and hidden
    representations can be supplied from elsewhere, which lets models be linked together.
    `params` allows sharing parameters instead of creating a new set.

    Parameters
    ----------
    inputs : List of [int or shape_tuple or Tuple of (shape, SharedVariable) or None]
        Dimensionality of the inputs and/or routing information to accept inputs from elsewhere.
        Variable hook tuples need to include the shape information.
    hiddens : List of [int or shape_tuple or Tuple of (shape, SharedVariable) or None], optional
        Dimensionality of the hidden representation and/or routing information to accept it
        from elsewhere. Variable hook tuples need to include the shape information.
    outputs : List of [int or shape tuple], optional
        Dimensionality of the output(s); stored as `self.output_size`.
    params : Dict(string_name: theano SharedVariable), optional
        Parameters to use instead of initializing new shared variables.
    outdir : str, optional
        Directory for outputs (parameters, images, etc.); resolved with ``os.path.realpath``.
        If None, nothing will be saved.
    kwargs : dict, optional
        All other keyword parameters; stored in `self.args` along with the values above.
    """
    self._classname = type(self).__name__
    log.info("Creating a new instance of %s", self._classname)

    # minimum requirements for modularity: everything list-like goes through raise_to_list
    self.inputs = raise_to_list(inputs)
    self.hiddens = raise_to_list(hiddens)
    self.output_size = raise_to_list(outputs)
    self.params = params

    # resolve and create the output directory when one was requested
    if outdir:
        outdir = os.path.realpath(outdir)
        mkdir_p(outdir)
    self.outdir = outdir

    # configuration dictionary: leftover kwargs first, then the core attributes
    self.args = add_kwargs_to_dict(kwargs.copy(), {})
    for key in ('inputs', 'hiddens', 'output_size', 'params', 'outdir'):
        self.args[key] = getattr(self, key)

    log.info("%s self.args: %s", self._classname, str(self.args))
    self.save_args()

    # the position of switches is not known yet
    self.switches_on = None
def __init__(self, inputs=None, hiddens=None, outputs=None, params=None, outdir=None, **kwargs):
    """
    Initialize a new Model.

    Model implementations should accept optional `inputs` and `hiddens` Theano symbolic
    expressions or variables (if applicable) so inputs and hidden representations can be supplied
    from elsewhere, which lets models be linked together. `params` allows sharing parameters
    rather than instantiating a new set.

    Parameters
    ----------
    inputs : List of [tuple(shape, `Theano.TensorType`) or Model] or None
        Dimensionality of the inputs and the routing information to accept inputs from elsewhere.
        `shape` is a tuple with one entry per dimension of the variable: an integer for a known
        size, or None if unknown. For a matrix with unknown batch size and 784 features the full
        form is ``[((None, 784), <TensorType(float32, matrix)>)]``.
        Any element that has both `output_size` and `get_outputs` (e.g. a Model) is replaced by
        the pairs ``zip(element.output_size, element.get_outputs())``. A single size is repeated
        when there are several outputs.
    hiddens : List of [tuple(shape, `Theano.TensorType`) or shape] or None, optional
        Dimensionality of the hidden representation and/or routing information to accept it from
        elsewhere. Shapes use the same format as for `inputs`.
    outputs : List of [int or shape tuple], optional
        Dimensionality of the output(s). An ``output_size`` entry in `kwargs` takes precedence.
    params : Dict(string_name: theano SharedVariable), optional
        Parameters to use instead of initializing new shared variables. Defaults to ``{}``.
    outdir : str, optional
        Directory for outputs (parameters, images, etc.); resolved with ``os.path.realpath``.
        If None, nothing will be saved.
    kwargs : dict, optional
        All other keyword parameters; stored in `self.args` along with the values above.
    """
    self._classname = self.__class__.__name__
    log.info("Creating a new instance of %s", self._classname)
    # Necessary inputs to a Model - these are the minimum requirements for modularity to work.
    self.inputs = raise_to_list(inputs)
    if self.inputs is not None:
        ins = []
        # deal with Models or ModifyLayers being passed as an input.
        for source in self.inputs:  # not named `input`, which would shadow the builtin
            if hasattr(source, 'output_size') and hasattr(source, 'get_outputs'):
                sizes = raise_to_list(source.output_size)
                outs = raise_to_list(source.get_outputs())
                if len(sizes) == 1 and len(sizes) < len(outs):
                    sizes = sizes * len(outs)
                # extend straight from zip: on Python 3 zip() is an iterator, and passing it
                # through raise_to_list would (presumably) wrap the iterator itself in a list
                # instead of yielding the (size, output) pairs
                ins.extend(zip(sizes, outs))
            else:
                ins.append(source)
        # replace self.inputs
        self.inputs = ins
    self.hiddens = raise_to_list(hiddens)
    self.output_size = raise_to_list(kwargs.get('output_size', outputs))
    self.params = params or {}
    self.outdir = outdir
    # make the directory to output configuration and parameters from the model
    if self.outdir:
        self.outdir = os.path.realpath(self.outdir)
        mkdir_p(self.outdir)
    # copy all of the parameters from the class into an args (configuration) dictionary
    self.args = {}
    self.args = add_kwargs_to_dict(kwargs.copy(), self.args)
    self.args['inputs'] = self.inputs
    self.args['hiddens'] = self.hiddens
    if self.output_size is not None:
        self.args['output_size'] = self.output_size
    self.args['params'] = self.params
    self.args['outdir'] = self.outdir
    # log the arguments.
    log.info("%s self.args: %s", self._classname, str(self.args))
    # save the arguments.
    self.save_args()
    # Don't know the position of switches!
    self.switches_on = None
def __init__(self, config=None, defaults=None, inputs_hook=None, hiddens_hook=None, params_hook=None,
             output_size=None, outdir=None, **kwargs):
    """
    Build the model's combined configuration in ``self.args`` from `defaults`, `config` and the
    explicitly passed arguments, with precedence defaults < config < explicit arguments.
    Every entry of ``self.args`` is finally also exposed as an attribute ``self.<param>``.

    Model implementations should accept optional `inputs_hook` and `hiddens_hook` so inputs and
    hidden representations can be supplied from elsewhere, and a `params_hook` to share
    parameters instead of creating a new set.

    :param config: User-defined parameters for the model, as a mapping or a path to a JSON or
        YAML configuration file.
    :type config: collections.Mapping object or String (.json file path or .yaml file path)
    :param defaults: Default parameters for the model, as a mapping or a path to a JSON or
        YAML configuration file.
    :type defaults: collections.Mapping object or String (.json file path or .yaml file path)
    :param inputs_hook: Routing information for the model to accept inputs from elsewhere;
        includes the shape information.
    :type inputs_hook: Tuple of (shape, variable)
    :param hiddens_hook: Routing information for the model to accept its hidden representation
        from elsewhere; includes the shape information.
    :type hiddens_hook: Tuple of (shape, variable)
    :param params_hook: Parameters to use instead of initializing new shared variables.
    :type params_hook: List(theano shared variable)
    :param output_size: Dimensionality of the output for this model. May be None if it is
        specified in the default or config dictionaries.
    :type output_size: int
    :param outdir: Directory for outputs (parameters, images, etc.); ``'outputs/'`` is used when
        no value is available.
    :type outdir: string
    :param kwargs: All other parameters passed to the class. A ``kwargs`` entry that is itself a
        dictionary is flattened into ``self.args``.
    :type kwargs: dict
    """
    log.info("Creating a new instance of %s", str(type(self)))

    # start from defaults overlaid with config; fall back to an empty dictionary
    merged = combine_config_and_defaults(config, defaults)
    self.args = {} if merged is None else merged

    def _override(key, value):
        # an explicit non-None value wins; None only fills a key that is missing entirely
        if value is not None or key not in self.args:
            self.args[key] = value

    for key, value in (('inputs_hook', inputs_hook),
                       ('hiddens_hook', hiddens_hook),
                       ('params_hook', params_hook),
                       ('output_size', output_size),
                       ('outdir', outdir)):
        _override(key, value)

    # overall default output directory, created right away
    if self.args['outdir'] is None:
        self.args['outdir'] = 'outputs/'
    mkdir_p(self.args['outdir'])

    # same override rule for everything else passed via kwargs
    for name, value in kwargs.items():
        if str(name) == 'kwargs':
            # flatten kwargs if it was passed as a variable
            for inner_name, inner_value in value.items():
                _override(str(inner_name), inner_value)
        else:
            _override(str(name), value)

    # both input_size and inputs_hook missing should only happen in a Prototype
    if self.args.get("input_size") is None and self.args.get('inputs_hook') is None:
        log.warning("Both input_size and inputs_hook are None! Make sure this is only happening "
                    "in a Prototype! Setting input_size to 1 for convenience to the Prototype.")
        self.args['input_size'] = 1

    # expose every parameter as self.<param>
    self.__dict__.update(self.args)

    log.debug("%s self.args: %s", str(type(self)), str(self.args))
    self.save_args()
def install(self):
    '''
    Locate the dataset on disk, downloading it from ``self.source`` and unzipping it if needed.

    The dataset path is ``self.dataset_dir`` (resolved relative to this file's directory)
    joined with ``self.filename``. The method first looks for the base name without any
    extension, then re-adds the extensions one at a time until ``get_file_type`` recognises
    a file. A ZIP result is unpacked next to the archive and the archive is removed.

    Returns
    -------
    tuple
        ``(dataset_location, file_type)`` as determined by the steps above.

    Raises
    ------
    RuntimeError
        If ``self.source`` is None.

    NOTE(review): the nesting of this method was reconstructed from a whitespace-mangled
    source; with this layout nothing is returned explicitly when ``self.filename`` is None.
    Confirm against the repository.
    '''
    file_type = None
    if self.filename is not None:
        log.info('Installing dataset %s', str(self.filename))
        # construct the actual path to the dataset
        # (temporarily work from this file's directory so a relative dataset_dir resolves there)
        prevdir = os.getcwd()
        os.chdir(os.path.split(os.path.realpath(__file__))[0])
        dataset_dir = os.path.realpath(self.dataset_dir)
        try:
            mkdir_p(dataset_dir)
            dataset_location = os.path.join(dataset_dir, self.filename)
        except Exception as e:
            log.error("Couldn't make the dataset path with directory %s and filename %s", dataset_dir, str(self.filename))
            log.exception("%s", str(e))
            dataset_location = None
        finally:
            # always restore the caller's working directory
            os.chdir(prevdir)
        # check if the dataset is already in the source, otherwise download it.
        # first check if the base filename exists - without all the extensions.
        # then, add each extension on and keep checking until the upper level, when you download from http.
        if dataset_location is not None:
            (dirs, fname) = os.path.split(dataset_location)
            split_fname = fname.split('.')
            accumulated_name = split_fname[0]
            found = False
            # first check if the filename was a directory (like for the midi datasets)
            if os.path.exists(os.path.join(dirs, accumulated_name)):
                found = True
                file_type = get_file_type(os.path.join(dirs, accumulated_name))
                dataset_location = os.path.join(dirs, accumulated_name)
                log.debug('Found file %s', dataset_location)
            # now go through the file extensions starting with the lowest level and check if the file exists
            if not found and len(split_fname) > 1:
                for chunk in split_fname[1:]:
                    accumulated_name = '.'.join((accumulated_name, chunk))
                    file_type = get_file_type(os.path.join(dirs, accumulated_name))
                    if file_type is not None:
                        dataset_location = os.path.join(dirs, accumulated_name)
                        log.debug('Found file %s', dataset_location)
                        break
        # if the file wasn't found, download it if a source was provided. Otherwise, raise error.
        # NOTE(review): as written the else branch raises whenever self.source is None, even if
        # the file was found locally above -- confirm whether that is intended.
        download_success = True
        if self.source is not None:
            if file_type is None:
                download_success = download_file(self.source, dataset_location)
                file_type = get_file_type(dataset_location)
        else:
            log.error("Filename %s couldn't be found, and no URL source to download was provided.", str(self.filename))
            raise RuntimeError("Filename %s couldn't be found, and no URL source to download was provided." % str(self.filename))
        # if the file type is a zip, unzip it.
        unzip_success = True
        if file_type is files.ZIP:
            (dirs, fname) = os.path.split(dataset_location)
            # unzip target: the archive name with its last extension dropped
            post_unzip = os.path.join(dirs, '.'.join(fname.split('.')[0:-1]))
            unzip_success = files.unzip(dataset_location, post_unzip)
            # if the unzip was successful
            if unzip_success:
                # remove the zipfile and update the dataset location and file type
                log.debug('Removing file %s', dataset_location)
                os.remove(dataset_location)
                dataset_location = post_unzip
                file_type = get_file_type(dataset_location)
        if download_success and unzip_success:
            log.info('Installation complete. Yay!')
        else:
            log.warning('Something went wrong installing dataset. Boo :(')
        return dataset_location, file_type
def config_root_logger(config_file='logging_config.json'):
    """
    Configures the root logger (returned from get_root_logger()) to the specifications in the JSON file
    `config_file`.

    If the environment variable ``LOG_CFG`` is set to a non-empty value, it is used as the path instead of
    `config_file`. Relative paths (for the configuration file and for handler filenames) are resolved against
    the directory containing this module. Whenever the file is missing, cannot be parsed, a handler's log
    directory cannot be created, or `logging.config.dictConfig` rejects the configuration, the function falls
    back to ``logging.basicConfig(level=logging.DEBUG)`` and logs the reason.

    Parameters
    ----------
    config_file : str
        The string path to the configuration JSON file to use.
    """
    # this could be called from scripts anywhere, but we want to keep the log-related items in this directory.
    # therefore, change the cwd to this file's directory and then change back at the end (even on error).
    prevdir = os.path.realpath(os.getcwd())
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    try:
        # the LOG_CFG environment variable, when set and non-empty, overrides `config_file`
        path = os.getenv('LOG_CFG', None) or config_file

        # couldn't find the configuration file
        if not os.path.exists(path):
            logging.basicConfig(level=logging.DEBUG)
            logger = get_root_logger()
            logger.warning("Could not find configuration file for logger! Was looking for {0!s}. "
                           "Using basicConfig instead...".format(os.path.realpath(path)))
            return

        init = True
        config = None
        with open(path, 'rt') as f:
            try:
                config = json.load(f)
            except Exception:
                logging.basicConfig(level=logging.DEBUG)
                logger = get_root_logger()
                logger.exception('Exception in reading the JSON logging config file!')
                logger.warning('Anyway, loading the basicConfig for the logger instead.')
                init = False

        if init:
            # make the directories for the log files named by the handlers.
            # A configuration without a 'handlers' section is valid, so treat it as empty.
            handlers = config.get('handlers') if isinstance(config, dict) else None
            if not isinstance(handlers, dict):
                handlers = {}
            for handler_config in handlers.values():
                if not isinstance(handler_config, dict):
                    continue
                filename = handler_config.get('filename')
                if filename is None:
                    continue
                dirs = os.path.split(os.path.normpath(filename))[0]
                if not dirs:
                    # the log file lives directly in the current directory; nothing to create
                    continue
                try:
                    mkdir_p(dirs)
                except Exception:
                    logging.basicConfig(level=logging.DEBUG)
                    logger = get_root_logger()
                    logger.exception('Exception in creating the directory for a logging handler! '
                                     'Path was {0!s}'.format(os.path.realpath(dirs)))
                    logger.warning('Anyway, loading the basicConfig for the logger instead.')
                    init = False

        # load the configuration into the logging module
        if init:
            try:
                logging.config.dictConfig(config)
            except Exception:
                logging.basicConfig(level=logging.DEBUG)
                logger = get_root_logger()
                logger.exception('Exception in loading the JSON logging config file to the logging module!')
                logger.warning('Anyway, loading the basicConfig for the logger instead.')
    finally:
        # change the directory back to the caller's working directory
        os.chdir(prevdir)
def install(self):
    """
    Method to both download and extract the dataset from the internet (if applicable) or verify that the
    file exists in the dataset_dir.

    The dataset path is built from `self.dataset_dir` (resolved relative to the directory containing this
    module) and `self.filename`. The base name (the part of the filename before the first '.') is checked
    first, then the name with each extension added back on. Only if nothing recognizable is found is the
    file downloaded from `self.source`. A zip archive is extracted next to itself and then deleted.

    Returns
    -------
    str
        The absolute path to the dataset location on disk (None if `self.filename` is None or the dataset
        path could not be created).
    int
        The integer representing the file type for the dataset, as defined in the opendeep.utils.file_ops
        module (None if it could not be determined).

    Raises
    ------
    RuntimeError
        If the dataset was not found on disk and `self.source` is None.
    """
    if self.filename is None:
        # nothing to install; still return a pair so callers can unpack the result.
        return None, None

    log.info('Installing dataset %s', str(self.filename))

    # construct the actual path to the dataset. Relative dataset_dir values are resolved against
    # this module's directory, so temporarily change into it and always change back.
    prevdir = os.getcwd()
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    try:
        dataset_dir = os.path.realpath(self.dataset_dir)
        try:
            mkdir_p(dataset_dir)
            dataset_location = os.path.join(dataset_dir, self.filename)
        except Exception as e:
            log.error("Couldn't make the dataset path with directory %s and filename %s",
                      dataset_dir, str(self.filename))
            log.exception("%s", str(e))
            dataset_location = None
    finally:
        os.chdir(prevdir)

    # check if the dataset is already on disk, otherwise download it.
    # first check if the base filename exists - without all the extensions.
    # then, add each extension on and keep checking until the full name has been tried.
    file_type = None
    if dataset_location is not None:
        dirs, fname = os.path.split(dataset_location)
        name_parts = fname.split('.')
        accumulated_name = name_parts[0]
        base_path = os.path.join(dirs, accumulated_name)
        if os.path.exists(base_path):
            # the filename was a directory or extension-less file (like for the midi datasets)
            file_type = get_file_type(base_path)
            dataset_location = base_path
            log.debug('Found file %s', dataset_location)
        else:
            # go through the file extensions starting with the lowest level
            for chunk in name_parts[1:]:
                accumulated_name = '.'.join((accumulated_name, chunk))
                candidate = os.path.join(dirs, accumulated_name)
                file_type = get_file_type(candidate)
                if file_type is not None:
                    dataset_location = candidate
                    log.debug('Found file %s', dataset_location)
                    break

    # if the file wasn't found, download it if a source was provided. Otherwise, raise error.
    # The check is keyed on "was it found", so a dataset that is already on disk does not need a source.
    download_success = True
    if file_type is None:
        if self.source is None:
            log.error("Filename %s couldn't be found, and no URL source to download was provided.",
                      str(self.filename))
            raise RuntimeError("Filename %s couldn't be found, and no URL source to download was provided." %
                               str(self.filename))
        if dataset_location is None:
            # the target path could not be created above, so there is nowhere to download to.
            download_success = False
        else:
            download_success = download_file(self.source, dataset_location)
            file_type = get_file_type(dataset_location)

    # if the file type is a zip, unzip it.
    unzip_success = True
    if file_type is not None and file_type == files.ZIP:
        dirs, fname = os.path.split(dataset_location)
        # strip only the last extension to get the extraction target
        post_unzip = os.path.join(dirs, '.'.join(fname.split('.')[0:-1]))
        unzip_success = files.unzip(dataset_location, post_unzip)
        if unzip_success:
            # remove the zipfile and update the dataset location and file type
            log.debug('Removing file %s', dataset_location)
            os.remove(dataset_location)
            dataset_location = post_unzip
            file_type = get_file_type(dataset_location)

    if download_success and unzip_success:
        log.info('Installation complete. Yay!')
    else:
        log.warning('Something went wrong installing dataset. Boo :(')

    return dataset_location, file_type
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None,
             input_size=None, output_size=None, outdir=None, **kwargs):
    """
    Initialize a new Model.

    Your model implementations should accept optional inputs_hook and hiddens_hook (if applicable) to set your
    inputs and hidden representation in a modular fashion, allowing models to link together.
    inputs_hook is a tuple of (shape, variable) that should replace the default model inputs.
    hiddens_hook is a tuple of (shape, variable) that should replace the default model hidden representation
    (which means you need to adapt creating your computation graph to not care about the inputs and to instead
    run outputs directly from the hidden variable provided).
    You can also accept a params_hook to share model parameters rather than instantiate a new set of parameters.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere. This is used for linking
        different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
        newly supervised classification model). For now, it needs to include the shape information (normally the
        dimensionality of the input i.e. n_in). When its first element is not None, it overrides `input_size`.
    hiddens_hook : Tuple of (shape, variable)
        Routing information for the model to accept its hidden representation from elsewhere.
        This is used for linking different models together (e.g. setting the GSN model's hidden layers to the RNN's
        output layer gives the RNN-GSN model, a deep recurrent model.) For now, it needs to include the shape
        information (normally the dimensionality of the hiddens i.e. n_hidden).
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing
        this model (instead of initializing your own shared variables). This parameter is useful when you want to
        have two versions of the model that use the same parameters - such as a training model with dropout applied
        to layers and one without for testing, where the parameters are shared between the two.
    input_size : int or shape tuple
        The dimensionality of the input for this model. This is required for stacking models
        automatically - where the input to one layer is the output of the previous layer. Defaults to 1
        (with a warning) when neither it nor `inputs_hook` provides a size.
    output_size : int or shape tuple
        The dimensionality of the output for this model. This is required for stacking models
        automatically - where the input to one layer is the output of the previous layer. Currently, we cannot
        infer the size from Theano's graph, so it needs to be explicit. Defaults to the input size (with a
        warning) when not given.
    outdir : str
        The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved.
    kwargs : dict
        This will be all the other left-over keyword parameters passed to the class as a
        dictionary of {param: value}. These get created into `self.args` along with outdir and output_size.
    """
    log.info("Creating a new instance of %s", str(type(self)))

    # Necessary inputs to a Model - these are the minimum requirements for modularity to work.
    self.inputs_hook = inputs_hook
    self.hiddens_hook = hiddens_hook
    self.params_hook = params_hook
    self.input_size = input_size
    self.output_size = output_size
    self.outdir = outdir

    # make sure outdir ends in a directory separator
    if self.outdir and self.outdir[-1] != os.sep:
        self.outdir += os.sep

    # Combine arguments that could specify input_size -> overwrite input_size with inputs_hook[0] if it exists.
    if self.inputs_hook and self.inputs_hook[0] is not None:
        self.input_size = self.inputs_hook[0]

    # Check if the input_size wasn't provided - if this is the case, it could either be a programmer's error
    # or it could be during the automatic stacking in a Container. Since that is a common use case, set
    # the input_size to 1 to avoid errors when instantiating the model.
    if not self.input_size:
        log.warning("No input_size or inputs_hook! Make sure this is done in a Container. Setting input_size"
                    "=1 for the Container now...")
        self.input_size = 1

    # Also, check if no output_size was given - this could be the case for generative models. Copy input_size
    # in that case. Could be an error (hopefully not), so give the warning.
    if not self.output_size:
        log.warning("No output_size given! Make sure this is from a generative model (where output_size is the "
                    "same as input_size). Setting output_size=input_size now...")
        self.output_size = self.input_size

    # copy all of the parameters from the class into an args (configuration) dictionary
    self.args = add_kwargs_to_dict(kwargs.copy(), {})
    self.args['output_size'] = self.output_size

    # set up base path for the outputs of the model during training, etc. and create the directory
    self.args['outdir'] = self.outdir
    if self.args['outdir']:
        mkdir_p(self.args['outdir'])

    # log the arguments.
    log.info("%s self.args: %s", str(type(self)), str(self.args))
    # save the arguments.
    self.save_args()
def __init__(self, inputs=None, hiddens=None, outputs=None, params=None, outdir=None, **kwargs):
    """
    Initialize a new Model.

    Your model implementations should accept optional inputs and hiddens Theano symbolic expressions
    or variables (if applicable) to set your inputs and hidden representation in a modular fashion,
    allowing models to link together. `inputs` can have a tuple of (shape, variable) that should replace
    the default model inputs. hiddens can have a tuple of (shape, variable) that should replace the
    default model hidden representation (which means you need to adapt creating your computation graph
    to not care about the inputs and to instead run outputs directly from the hidden variable provided).
    You can also accept a params to share model parameters rather than instantiate a new set of parameters.

    Parameters
    ----------
    inputs : List of [tuple(shape, `Theano.TensorType`) or Model] or None
        The dimensionality of the inputs for this model, and the routing information for the model
        to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the
        Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model).
        `shape` will be a monad tuple representing known sizes for each dimension in the `Theano.TensorType`.
        The length of `shape` should be equal to number of dimensions in `Theano.TensorType`, where the shape
        element is an integer representing the size for its dimension, or None if the shape isn't known.
        For example, if you have a matrix with unknown batch size but fixed feature size of 784, `shape` would
        be: (None, 784). The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
        If a :class:`Model` (anything with `output_size` and `get_outputs`) is given as the input, it is
        replaced with the pairs from zip(Model.output_size, Model.get_outputs()); a single output size is
        repeated to match multiple outputs.
    hiddens : List of [tuple(shape, `Theano.TensorType`) or shape] or None, optional
        The dimensionality of the hidden representation for this model, and/or the routing information for
        the model to accept its hidden representation from elsewhere. This is used for linking different
        models together (e.g. setting the GSN model's hidden layers to the RNN's output layer gives the
        RNN-GSN model, a deep recurrent model.) For now, variable hook tuples need to include the shape
        information (normally the dimensionality of the hiddens i.e. n_hidden). This shape information is the
        same format as the monad for `inputs`.
    outputs : List of [int or shape tuple], optional
        The dimensionality of the output(s) for this model. Shape here is the shape monad described in
        `inputs`. An `output_size` keyword argument, if present, takes precedence over this value.
    params : Dict(string_name: theano SharedVariable), optional
        A dictionary of model parameters (shared theano variables) that you should use when constructing
        this model (instead of initializing your own shared variables). This parameter is useful when you want
        to have two versions of the model that use the same parameters - such as siamese networks or
        pretraining some weights.
    outdir : str, optional
        The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved.
    kwargs : dict, optional
        This will be all the other left-over keyword parameters passed to the class as a
        dictionary of {param: value}. These get created into `self.args` along with outdir and outputs.
    """
    self._classname = self.__class__.__name__
    log.info("Creating a new instance of %s", self._classname)

    # Necessary inputs to a Model - these are the minimum requirements for modularity to work.
    self.inputs = raise_to_list(inputs)
    if self.inputs is not None:
        ins = []
        # deal with Models or ModifyLayers being passed as an input.
        for item in self.inputs:
            if hasattr(item, 'output_size') and hasattr(item, 'get_outputs'):
                sizes = raise_to_list(item.output_size)
                outs = raise_to_list(item.get_outputs())
                # one size given for several outputs: reuse it for every output
                if len(sizes) == 1 and len(sizes) < len(outs):
                    sizes = sizes * len(outs)
                # extend directly from zip so the (shape, variable) pairs are flattened into `ins`
                # whether zip returns a list or a lazy iterator.
                ins.extend(zip(sizes, outs))
            else:
                ins.append(item)
        # replace self.inputs
        self.inputs = ins

    self.hiddens = raise_to_list(hiddens)
    self.output_size = raise_to_list(kwargs.get('output_size', outputs))
    self.params = params or {}
    self.outdir = outdir

    # make the directory to output configuration and parameters from the model
    if self.outdir:
        self.outdir = os.path.realpath(self.outdir)
        mkdir_p(self.outdir)

    # copy all of the parameters from the class into an args (configuration) dictionary
    self.args = add_kwargs_to_dict(kwargs.copy(), {})
    self.args['inputs'] = self.inputs
    self.args['hiddens'] = self.hiddens
    if self.output_size is not None:
        self.args['output_size'] = self.output_size
    self.args['params'] = self.params
    self.args['outdir'] = self.outdir

    # log the arguments.
    log.info("%s self.args: %s", self._classname, str(self.args))
    # save the arguments.
    self.save_args()
    # Boom! Hyperparameters are now dealt with. Take that!

    # Don't know the position of switches!
    self.switches_on = None
def config_root_logger(config_file='logging_config.json'):
    """
    Configures the root logger (returned from get_root_logger()) to the specifications in the JSON file
    `config_file`.

    If the environment variable ``LOG_CFG`` is set to a non-empty value, it is used as the path instead of
    `config_file`. Relative paths (for the configuration file and for handler filenames) are resolved against
    the directory containing this module. Whenever the file is missing, cannot be parsed, a handler's log
    directory cannot be created, or `logging.config.dictConfig` rejects the configuration, the function falls
    back to ``logging.basicConfig(level=logging.DEBUG)`` and logs the reason.

    Parameters
    ----------
    config_file : str
        The string path to the configuration JSON file to use.
    """
    # this could be called from scripts anywhere, but we want to keep the log-related items in this directory.
    # therefore, change the cwd to this file's directory and then change back at the end (even on error).
    prevdir = os.path.realpath(os.getcwd())
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    try:
        # the LOG_CFG environment variable, when set and non-empty, overrides `config_file`
        path = os.getenv('LOG_CFG', None) or config_file

        # couldn't find the configuration file
        if not os.path.exists(path):
            logging.basicConfig(level=logging.DEBUG)
            logger = get_root_logger()
            logger.warning("Could not find configuration file for logger! Was looking for {0!s}. "
                           "Using basicConfig instead...".format(os.path.realpath(path)))
            return

        usable = True
        config = None
        with open(path, 'rt') as f:
            try:
                config = json.load(f)
            except Exception:
                logging.basicConfig(level=logging.DEBUG)
                logger = get_root_logger()
                logger.exception('Exception in reading the JSON logging config file!')
                logger.warning('Anyway, loading the basicConfig for the logger instead.')
                usable = False

        if usable:
            # make the directories for the log files named by the handlers.
            # A configuration without a 'handlers' section is valid, so treat it as empty.
            handlers = config.get('handlers') if isinstance(config, dict) else None
            if not isinstance(handlers, dict):
                handlers = {}
            for handler_config in handlers.values():
                if not isinstance(handler_config, dict):
                    continue
                filename = handler_config.get('filename')
                if filename is None:
                    continue
                dirs = os.path.split(os.path.normpath(filename))[0]
                if not dirs:
                    # the log file lives directly in the current directory; nothing to create
                    continue
                try:
                    mkdir_p(dirs)
                except Exception:
                    logging.basicConfig(level=logging.DEBUG)
                    logger = get_root_logger()
                    logger.exception('Exception in creating the directory for a logging handler! '
                                     'Path was {0!s}'.format(os.path.realpath(dirs)))
                    logger.warning('Anyway, loading the basicConfig for the logger instead.')
                    usable = False

        # load the configuration into the logging module
        if usable:
            try:
                logging.config.dictConfig(config)
            except Exception:
                logging.basicConfig(level=logging.DEBUG)
                logger = get_root_logger()
                logger.exception('Exception in loading the JSON logging config file to the logging module!')
                logger.warning('Anyway, loading the basicConfig for the logger instead.')
    finally:
        # change the directory back to the caller's working directory
        os.chdir(prevdir)