def test1_1(self):
    """Smoke test for ModelWrapperDefault on the ionosphere test dataset.

    With random (untrained) weights the validation accuracy must be poor
    (< 0.7); after training (only run when SLOW_TESTS is enabled) it must
    exceed 0.8.
    """
    dataset = Dataset(TESTFILE1)
    # make results based on random weights repeatable
    torch.manual_seed(1)
    model_wrapper = ModelWrapperDefault(dataset)
    print("\nDEBUG: dataset=", model_wrapper.dataset, file=sys.stderr)
    module = model_wrapper.get_module()
    model_wrapper.prepare_data()
    print("\nDEBUG: module:", module, file=sys.stderr)
    loss, acc = model_wrapper.evaluate(model_wrapper.valset, train_mode=False, as_pytorch=False)
    assert acc < 0.7
    print("\nDEBUG: test1_1 before training loss/acc=%s/%s" % (loss, acc), file=sys.stderr)
    if SLOW_TESTS:
        model_wrapper.train(batch_size=20, max_epochs=60, early_stopping=False)
        loss, acc = model_wrapper.evaluate(model_wrapper.valset, train_mode=False, as_pytorch=False)
        assert acc > 0.8
        print("\nDEBUG: test1_1 after training loss/acc=%s/%s" % (loss, acc), file=sys.stderr)
def init_after_load(self, filenameprefix, cuda=None):
    """
    If cuda is not None, try to load the module directly to cpu or cuda, as requested.
    If cuda is None, let pytorch decide what to do.
    """
    # Suppress noise while restoring: route warnings through logging and raise
    # the ROOT logger level to CRITICAL (note: this silences logging globally,
    # not just for this class).
    logging.captureWarnings(True)
    logger = logging.getLogger()
    logger.setLevel(logging.CRITICAL)
    # Rebuild the dataset-derived state from the stored metafile before
    # restoring the pytorch module itself.
    self.dataset = Dataset(self.metafile)
    self.init_from_dataset()
    if cuda is None:
        # No preference given: torch.load restores to whatever device the
        # module was saved from.
        self.module = torch.load(filenameprefix + ".module.pytorch")
    else:
        if cuda:
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        # Load with an explicit map_location so a CUDA-saved model can be
        # restored on CPU (and vice versa).
        self.module = torch.load(filenameprefix + ".module.pytorch", map_location=str(device))
        # make doubly sure
        self.module.to(device)
        self.set_cuda(cuda)
    # A freshly loaded wrapper has no prepared batches / validation split yet.
    self.is_data_prepared = False
    self.valset = None
# Module-level logging setup: DEBUG-level output to stderr (formatted) and to
# a per-module log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
streamhandler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
# NOTE(review): no formatter is set on the file handler, so the log file gets
# bare messages — confirm this is intended.
filehandler = logging.FileHandler(__name__ + ".log")
logger.addHandler(filehandler)

# Locations of the test data, relative to this file.
TESTDIR = os.path.join(os.path.dirname(__file__), '.')
DATADIR = os.path.join(TESTDIR, 'data')
TESTFILE1 = os.path.join(DATADIR, "class-ionosphere.meta.json")

# Load the dataset once at import time and derive network dimensions from its
# metadata.
ds = Dataset(TESTFILE1)
ds_info = ds.get_info()
logger.info("META: %r" % ds_info)
nFeatures = ds_info["nFeatures"]  # we know they are all numeric!!
nClasses = ds_info["nClasses"]
# Heuristic hidden-layer size: square root of the number of input features.
hidden = int(math.sqrt(nFeatures))


class MyModel(nn.Module):
    # NOTE(review): the rest of this class continues beyond the visible chunk;
    # only the constructor preamble is shown here.
    def __init__(self):
        super(MyModel, self).__init__()
        # first figure out how many inputs we need and also configure the mapper objects for them
        # 1) if there is at least one numeric or binary input we create a linear+nonlin layer for all of them
        # The mapper object concatenates those features and converts to FloatTensor variables
def main(sysargs):
    """Command-line entry point: parse options, create or resume a model
    wrapper for the given metafile, train it and save it under modelname.

    Args:
        sysargs: the full argv list; sysargs[0] (the program name) is skipped.

    Raises:
        Exception: if --es_metric is not 'loss'/'accuracy' or if metafile or
            modelname are missing.
    """
    logger.debug("Called with args=%s" % (sysargs,))
    parser = argparse.ArgumentParser()
    parser.add_argument("--embs", type=str, help="Override embedding settings, specify as embid:embdims:embtrain:embminfreq:embfile,embid:embdims ..")
    parser.add_argument("--maxlens", type=str, help="Override maxlen/shorten, specify as attrnr:maxlen:shorten,attrnr:maxlen:shorten (empty attrnr: all)")
    parser.add_argument("--valsize", type=float, help="Set the validation set size (>1) or proportion (<1)")
    parser.add_argument("--valeverybatches", type=int, default=None, help="Evaluate on validation set and log every that many batches (None)")
    parser.add_argument("--valeveryepochs", type=int, default=1, help="Evaluate on validation set and log every that many epochs (1)")
    parser.add_argument("--valeveryinstances", type=int, default=None, help="Evaluate on validation set and log every that many instances (None)")
    parser.add_argument("--repeveryinstances", type=int, default=500, help="Report on training set and log every that many instances (500)")
    parser.add_argument("--repeverybatches", type=int, default=None, help="Report on training set and log every that many batches (None)")
    parser.add_argument("--batchsize", type=int, default=32, help="Batch size")
    parser.add_argument("--maxepochs", type=int, default=50, help="Maximum number of epochs")
    parser.add_argument("--stopfile", type=str, help="If that file exists, training is stopped")
    parser.add_argument("--module", type=str, help="The class/file name to use for the pytorch module (within modelzoo)")
    parser.add_argument("--wrapper", type=str, help="The class/file name to use as the model wrapper")
    parser.add_argument("--learningrate", type=float, help="Override default learning rate for the optimizer")
    parser.add_argument("--ngram_layer", type=str, default="cnn", help="Architecture to use for ngrams: lstm or cnn (cnn)")
    parser.add_argument("--es_patience", type=int, default=2, help="Early stopping patience iterations (2)")
    parser.add_argument("--es_metric", type=str, default="loss", help="Which metric to use for early stopping, 'loss' or 'accuracy' (loss)")
    parser.add_argument("--elmo", type=str, default=None, help="Use elmo model for embedding, specify path to elmo model")
    parser.add_argument("--cuda", type=utils.str2bool, help="True/False to use CUDA or not, omit to determine automatically")
    parser.add_argument("--seed", type=int, default=0, help="Random seed to make experiments repeatable/explore randomness (default 0=random random seed)")
    parser.add_argument("--noshuffle", action="store_true", help="Prevent shuffling of the dataset (False)")
    # NOTE: resume currently does not make sure that the original metafile info is used (but maybe new data):
    # This should work once the metadata is actually stored as part of the model!
    parser.add_argument("--resume", action='store_true', help="Resume training from the specified model")
    parser.add_argument("--notrain", action='store_true', help="Do not actually run training, but show generated model")
    parser.add_argument("--nocreate", action='store_true', help="Do not actually even create module (do nothing)")
    parser.add_argument("--valfile", type=str, default=None, help="Use this file for validation")
    parser.add_argument("--version", action='version', version=gatelfpytorchjson.__version__)
    parser.add_argument("--debug", action='store_true', help="Set logger to DEBUG and show more information")
    parser.add_argument("metafile", help="Path to metafile (REQUIRED)")
    parser.add_argument("modelname", help="Model path prefix (full path and beginning of model file name) (REQUIRED)")
    args = parser.parse_args(args=sysargs[1:])
    if args.es_metric not in ["loss", "accuracy"]:
        raise Exception("es_metric must be loss or accuracy")
    metafile = args.metafile
    modelname = args.modelname
    if not metafile or not modelname:
        raise Exception("Metafile or modelfile not specified, use --help parameter for help\nLearningFramework defaults are: crvd.meta.json FileJsonPyTorch.model")
    datadir = str(Path(metafile).parent)
    config = vars(args)
    if config.get("debug"):
        logger.setLevel(logging.DEBUG)
    es_patience = config.get("es_patience")
    es_mindelta = 0.0

    # Early-stopping callback passed to wrapper.train(); it deliberately
    # ignores its own patience/mindelta/metric parameters and closes over the
    # configured values instead.
    def es_lambda(losses=None, accs=None, patience=None, mindelta=None, metric="loss"):
        return ModelWrapper.\
            early_stopping_checker(losses, accs, patience=es_patience,
                                   mindelta=es_mindelta, metric=config["es_metric"])
    # Also log to a file
    filehandler = logging.FileHandler(os.path.join(datadir, "pytorch-json.train.log"))
    filehandler.setFormatter(formatter)
    # BUGFIX: the file handler was created and formatted but never attached to
    # the logger, so the training log file stayed empty.
    logger.addHandler(filehandler)
    # in order to override the logging level of any of the modules/classes used,
    # get the logger and do it here
    # logger1 = logging.getLogger("gatelfpytorchjson.modelwrapperdefault")
    # logger1.setLevel(logging.DEBUG)
    # TODO: use a static Dataset method to parse the remaining args and create an args
    # dict to pass to the constructors of Dataset and the wrapper so each can pick the
    # parameters relevant to them!
    logger.debug("Running train.py, config is %r" % config)
    if config.get("nocreate"):
        logger.info("--nocreate specified, exiting")
        sys.exit(0)
    logger.debug("Loading metafile...")
    ds = Dataset(metafile, config=config)
    logger.debug("Metafile loaded.")
    # determine and use the correct modelwrapper
    # default is ModelWrapperDefault
    wrapper_class = ModelWrapperDefault
    if config.get("wrapper"):
        wrapperclassname = config["wrapper"]
        import importlib
        module = importlib.import_module("gatelfpytorchjson." + wrapperclassname)
        # BUGFIX: this was assigned to the unused name wrapper_class_, which
        # made the --wrapper option a silent no-op.
        wrapper_class = getattr(module, wrapperclassname)
    # TODO: test passing on parameters
    if config.get("resume"):
        logger.info("--resume specified, loading and continuing on existing model")
        wrapper = wrapper_class.load(modelname, metafile=metafile)
        logger.debug("Modelwrapper loaded")
        logger.debug("Model is %r" % wrapper)
    else:
        logger.debug("Creating ModelWrapperSimple")
        wrapper = wrapper_class(ds, config=config)
        logger.debug("Modelwrapper created")
        logger.debug("Model is %r" % wrapper)
    if config.get("notrain"):
        logger.info("--notrain specified, exiting")
        sys.exit(0)
    if config.get("debug"):
        # Propagate DEBUG to the wrapper's own logger, if it exposes one.
        glf = getattr(wrapper, "get_logger", None)
        if glf and callable(glf):
            wlogger = wrapper.get_logger()
            logger.debug("Setting wrapper logging level to DEBUG")
            wlogger.setLevel(logging.DEBUG)
        else:
            logger.debug("Wrapper has not logging, cannot set to DEBUG")
    # TODO: the default to use for validation set size should be settable through config in the constructor!
    logger.debug("Preparing the data...")
    # if we have a validation file, use it, ignore the valsize
    if config.get("valfile"):
        wrapper.prepare_data(file=config["valfile"])
    else:
        valsize = config.get("valsize")
        if valsize is not None:
            wrapper.prepare_data(validationsize=valsize)
        else:
            wrapper.prepare_data()
    logger.debug("Data prepared")
    wrapper.validate_every_batches = config["valeverybatches"]
    wrapper.validate_every_epochs = config["valeveryepochs"]
    wrapper.validate_every_instances = config["valeveryinstances"]
    wrapper.report_every_instances = config["repeveryinstances"]
    wrapper.report_every_batches = config["repeverybatches"]
    # TODO: figure out what good defaults are here and what we want to set here rather than
    # in the constructor. Maybe allow to set everything in the constructor for simplicity?
    logger.info("Model: %r" % wrapper)
    logger.debug("Start training...")
    wrapper.train(batch_size=config["batchsize"],
                  early_stopping=es_lambda, max_epochs=config["maxepochs"],
                  filenameprefix=modelname)
    logger.debug("Training completed")
    # NOTE: this will save the modelwrapper, and will ONLY save the model if we did not already
    # save the best model during training!
    logger.debug("Saving model...")
    wrapper.save(modelname)
    logger.debug("Model saved")
    # print the model used again so we do not have to scoll back a huge log ...
    logger.info("Model: %r" % wrapper)
from __future__ import print_function from gatelfdata import Dataset import sys if len(sys.argv) != 2: raise Exception("Need one parameter: meta file") file = sys.argv[1] ds = Dataset(file) valset = ds.convert_to_file() for b in ds.batches_converted(batch_size=20, as_numpy=False, pad_left=True): print("Batch: len=", len(b)) print("Batch: data=", b)
# (Re)configure the logger for console output at DEBUG level.
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler = logging.StreamHandler()
# NOTE(review): the formatter is created but never attached to the handler
# here — confirm whether that is intended.
logger.addHandler(streamhandler)
# Code to figure out best way to load pre-calculated embeddings fast an efficiently
# We need to load embeddings in a way where only those embeddings are loaded which
# occur also in our own vocabulary.

# Baseline memory footprint before loading anything.
process = psutil.Process(os.getpid())
mem_rss0, mem_vms0 = process.memory_info()[0:2]
memusage0 = memory_usage(-1, interval=1, timeout=1)[0]
print("Before loading dataset: Memory RSS/VMS=%s/%s // %s" % (mem_rss0, mem_vms0, memusage0))
print("System memory", psutil.virtual_memory())
ds = Dataset(TESTFILE3)
# Memory footprint after the dataset (and its vocabularies) are loaded;
# report both absolute values and the deltas against the baseline.
mem_rss1, mem_vms1 = process.memory_info()[0:2]
memusage1 = memory_usage(-1, interval=1, timeout=1)[0]
print("After loading dataset: Memory RSS/VMS // Mem=%s/%s // %s" % (mem_rss1, mem_vms1, memusage1))
print("After loading dataset: diffs RSS/VMS // Mem=%s/%s // %s" % (mem_rss1-mem_rss0, mem_vms1-mem_vms0, memusage1-memusage0))
print("Dataset vocabs: ", ds.vocabs)
vtoken = ds.vocabs.get_vocab("token")
print("Token vocab: ", vtoken)
# test our simple approach to loading embeddings. We expect a link or copy of glove.6B.50d.txt.gz in the tests/data
# directory for this
# emb_file = "tests/data/glove.6B.50d.txt.gz"
# emb_file = "tests/data/wiki.en.vec"
emb_file = sys.argv[1]
# String-to-index map of the token vocabulary; presumably used below to filter
# the embeddings file to known tokens — continuation is outside this chunk.
stoi = vtoken.stoi
# Set up logging logger = logging.getLogger("gatelfdata") logger.setLevel(logging.ERROR) logger = logging.getLogger("gatelfkerasjson") logger.setLevel(logging.DEBUG) streamhandler = logging.StreamHandler(stream=sys.stderr) formatter = logging.Formatter( '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') streamhandler.setFormatter(formatter) logger.addHandler(streamhandler) filehandler = logging.FileHandler( os.path.join(datadir, "FileJsonKerasWrapper.train.log")) logger.addHandler(filehandler) # restore the wrapper ds = Dataset(metafile, targets_need_padding=False) wrapper = KerasWrapperImpl1(ds) wrapper.loadModel(modelprefix) with sys.stdin as infile: for line in infile: #! print("PYTHON FileJsonKeras APPLICATION, input=",line,file=sys.stderr) if line == "STOP": break # TODO: currently the LF sends individual instances here, we may want to change # However we need to always apply to a set of instances, so wrap into another array instancedata = json.loads(line) # TODO: better error handling: put the apply call into a try block and catch any error, also # check returned data. If there is a problem send back in the map we return!! # NOTE: the LF expects to get a map with the following elements: # status: must be "ok", anything else is interpreted as an error
from __future__ import print_function from gatelfdata import Dataset import sys if len(sys.argv) != 2: raise Exception("Need one parameter: meta file") file = sys.argv[1] ds = Dataset(file) valset = ds.convert_to_file() for instance in ds.instances_as_data(): print("Instance: ", instance)
from __future__ import print_function from gatelfdata import Dataset import sys import json if len(sys.argv) != 2: raise Exception("Need one parameter: meta file") file = sys.argv[1] ds = Dataset(file) valset = ds.convert_to_file() it = iter(ds.instances_as_string()) for n in range(20): b = [] print("BATCH: ", n) for i in range(2): print("INSTANCE: ", i) instance = next(it) print("Instance: ", instance) converted = ds.convert_instance(json.loads(instance)) print("Converted: ", converted) b.append(converted) batch1 = ds.reshape_batch(b) print("Size2 batch: ", batch1) print()
def main(sysargs):
    """Command-line entry point (configsimple variant): build/resume a model
    wrapper for the configured metafile, train it and save it under modelname.

    Args:
        sysargs: the full argv list; sysargs[0] (the program name) is skipped.

    Raises:
        Exception: if es_metric is not 'loss'/'accuracy' or if metafile or
            modelname are missing.
    """
    logger.debug("Called with args=%s" % (sysargs,))
    myconfig = ModelWrapperDefault.configsimple(topconfig)
    myconfig.parse_args(args=sysargs[1:])
    if myconfig.get("es_metric") not in ["loss", "accuracy"]:
        raise Exception("es_metric must be loss or accuracy")
    metafile = myconfig.get("metafile")
    modelname = myconfig.get("modelname")
    if not metafile or not modelname:
        raise Exception("Metafile or modelfile not specified, use --help parameter for help\nLearningFramework defaults are: crvd.meta.json FileJsonPyTorch.model")
    datadir = str(Path(metafile).parent)
    if myconfig.get("debug"):
        logger.setLevel(logging.DEBUG)
    es_patience = myconfig.get("es_patience")
    es_mindelta = 0.0

    # Early-stopping callback passed to wrapper.train(); it deliberately
    # ignores its own patience/mindelta/metric parameters and closes over the
    # configured values instead.
    def es_lambda(losses=None, accs=None, patience=None, mindelta=None, metric="loss"):
        return ModelWrapper.\
            early_stopping_checker(losses, accs, patience=es_patience,
                                   mindelta=es_mindelta, metric=myconfig["es_metric"])
    # Also log to a file
    filehandler = logging.FileHandler(os.path.join(datadir, "pytorch-json.train.log"))
    filehandler.setFormatter(formatter)
    # BUGFIX: the file handler was created and formatted but never attached to
    # the logger, so the training log file stayed empty.
    logger.addHandler(filehandler)
    # in order to override the logging level of any of the modules/classes used,
    # get the logger and do it here
    # logger1 = logging.getLogger("gatelfpytorchjson.modelwrapperdefault")
    # logger1.setLevel(logging.DEBUG)
    # TODO: use a static Dataset method to parse the remaining args and create an args
    # dict to pass to the constructors of Dataset and the wrapper so each can pick the
    # parameters relevant to them!
    logger.debug("Running train.py, config is %r" % myconfig)
    if myconfig.get("nocreate"):
        logger.info("--nocreate specified, exiting")
        sys.exit(0)
    logger.debug("Loading metafile...")
    ds = Dataset(metafile, config=myconfig)
    logger.debug("Metafile loaded.")
    # determine and use the correct modelwrapper
    # default is ModelWrapperDefault
    wrapper_class = ModelWrapperDefault
    if myconfig.get("wrapper"):
        wrapperclassname = myconfig["wrapper"]
        import importlib
        module = importlib.import_module("gatelfpytorchjson." + wrapperclassname)
        # BUGFIX: this was assigned to the unused name wrapper_class_, which
        # made the wrapper option a silent no-op.
        wrapper_class = getattr(module, wrapperclassname)
    # TODO: test passing on parameters
    if myconfig.get("resume"):
        logger.info("--resume specified, loading and continuing on existing model")
        wrapper = wrapper_class.load(modelname, metafile=metafile)
        logger.debug("Modelwrapper loaded")
        logger.debug("Model is %r" % wrapper)
    else:
        logger.debug("Creating ModelWrapperSimple")
        wrapper = wrapper_class(ds, config=myconfig)
        logger.debug("Modelwrapper created")
        logger.debug("Model is %r" % wrapper)
    if myconfig.get("notrain"):
        logger.info("--notrain specified, exiting")
        sys.exit(0)
    if myconfig.get("debug"):
        # Propagate DEBUG to the wrapper's own logger, if it exposes one.
        glf = getattr(wrapper, "get_logger", None)
        if glf and callable(glf):
            wlogger = wrapper.get_logger()
            logger.debug("Setting wrapper logging level to DEBUG")
            wlogger.setLevel(logging.DEBUG)
        else:
            logger.debug("Wrapper has not logging, cannot set to DEBUG")
    # TODO: the default to use for validation set size should be settable through config in the constructor!
    logger.debug("Preparing the data...")
    # if we have a validation file, use it, ignore the valsize
    if myconfig.get("valfile"):
        wrapper.prepare_data(file=myconfig["valfile"])
    else:
        valsize = myconfig.get("valsize")
        if valsize is not None:
            wrapper.prepare_data(validationsize=valsize)
        else:
            wrapper.prepare_data()
    logger.debug("Data prepared")
    wrapper.validate_every_batches = myconfig["valeverybatches"]
    wrapper.validate_every_epochs = myconfig["valeveryepochs"]
    wrapper.validate_every_instances = myconfig["valeveryinstances"]
    wrapper.report_every_instances = myconfig["repeveryinstances"]
    wrapper.report_every_batches = myconfig["repeverybatches"]
    # TODO: figure out what good defaults are here and what we want to set here rather than
    # in the constructor. Maybe allow to set everything in the constructor for simplicity?
    logger.info("Model: %r" % wrapper)
    logger.debug("Start training...")
    wrapper.train(batch_size=myconfig["batchsize"],
                  early_stopping=es_lambda, max_epochs=myconfig["maxepochs"],
                  filenameprefix=modelname)
    logger.debug("Training completed")
    # NOTE: this will save the modelwrapper, and will ONLY save the model if we did not already
    # save the best model during training!
    logger.debug("Saving model...")
    wrapper.save(modelname)
    logger.debug("Model saved")
    # print the model used again so we do not have to scoll back a huge log ...
    logger.info("Model: %r" % wrapper)