Example 1
def test1_1(self):
    ds = Dataset(TESTFILE1)
    torch.manual_seed(1)  # make results based on random weights repeatable
    wrapper = ModelWrapperDefault(ds)
    print("\nDEBUG: dataset=", wrapper.dataset, file=sys.stderr)
    m = wrapper.get_module()
    wrapper.prepare_data()
    print("\nDEBUG: module:", m, file=sys.stderr)
    (loss, acc) = wrapper.evaluate(wrapper.valset,
                                   train_mode=False,
                                   as_pytorch=False)
    assert acc < 0.7
    print("\nDEBUG: test1_1 before training loss/acc=%s/%s" % (loss, acc),
          file=sys.stderr)
    if SLOW_TESTS:
        wrapper.train(batch_size=20, max_epochs=60, early_stopping=False)
        (loss, acc) = wrapper.evaluate(wrapper.valset,
                                       train_mode=False,
                                       as_pytorch=False)
        assert acc > 0.8
        print("\nDEBUG: test1_1 after training loss/acc=%s/%s" %
              (loss, acc),
              file=sys.stderr)
Example 2
def init_after_load(self, filenameprefix, cuda=None):
    """
    If cuda is not None, try to load the module directly to cpu or cuda, as requested.
    If cuda is None, let pytorch decide what to do.
    """
    logging.captureWarnings(True)
    logger = logging.getLogger()
    logger.setLevel(logging.CRITICAL)
    self.dataset = Dataset(self.metafile)
    self.init_from_dataset()
    if cuda is None:
        self.module = torch.load(filenameprefix + ".module.pytorch")
    else:
        if cuda:
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.module = torch.load(filenameprefix + ".module.pytorch",
                                 map_location=str(device))
        # make doubly sure the module really is on the requested device
        self.module.to(device)
        self.set_cuda(cuda)
    self.is_data_prepared = False
    self.valset = None
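
The three cuda modes map onto the branches above; a minimal usage sketch (the "mymodel" prefix is hypothetical and stands in for a real filename prefix):

wrapper.init_after_load("mymodel")              # cuda=None: let pytorch decide
wrapper.init_after_load("mymodel", cuda=True)   # load straight to the GPU
wrapper.init_after_load("mymodel", cuda=False)  # load straight to the CPU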
Example 3
import os
import math
import logging
import torch.nn as nn
from gatelfdata import Dataset

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
streamhandler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
filehandler = logging.FileHandler(__name__ + ".log")
logger.addHandler(filehandler)

TESTDIR = os.path.join(os.path.dirname(__file__), '.')
DATADIR = os.path.join(TESTDIR, 'data')
TESTFILE1 = os.path.join(DATADIR, "class-ionosphere.meta.json")

ds = Dataset(TESTFILE1)
ds_info = ds.get_info()
logger.info("META: %r" % ds_info)

nFeatures = ds_info["nFeatures"]  # we know they are all numeric!!
nClasses = ds_info["nClasses"]

hidden = int(math.sqrt(nFeatures))


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # first figure out how many inputs we need and also configure the mapper objects for them
        # 1) if there is at least one numeric or binary input we create a linear+nonlin layer for all of them
        #    The mapper object concatenates those features and converts to FloatTensor variables
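        # The class body is truncated here. A minimal sketch of what could
        # follow for this all-numeric dataset (the layer names and the ELU
        # nonlinearity are assumptions, not the original code):
        self.layer_hidden = nn.Linear(nFeatures, hidden)
        self.nonlin = nn.ELU()
        self.layer_out = nn.Linear(hidden, nClasses)

    def forward(self, x):
        # x: FloatTensor of shape (batchsize, nFeatures), i.e. the
        # concatenated features produced by the mapper objects
        return self.layer_out(self.nonlin(self.layer_hidden(x)))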
Example 4
def main(sysargs):

    logger.debug("Called with args=%s" % (sysargs,))
    parser = argparse.ArgumentParser()
    parser.add_argument("--embs", type=str, help="Override embedding settings, specify as embid:embdims:embtrain:embminfreq:embfile,embid:embdims ..")
    parser.add_argument("--maxlens", type=str, help="Override maxlen/shorten, specify as attrnr:maxlen:shorten,attrnr:maxlen:shorten (empty attrnr: all)")
    parser.add_argument("--valsize", type=float, help="Set the validation set size (>1) or proportion (<1)")
    parser.add_argument("--valeverybatches", type=int, default=None, help="Evaluate on validation set and log every that many batches (None)")
    parser.add_argument("--valeveryepochs", type=int, default=1, help="Evaluate on validation set and log every that many epochs (1)")
    parser.add_argument("--valeveryinstances", type=int, default=None, help="Evaluate on validation set and log every that many instances (None)")
    parser.add_argument("--repeveryinstances", type=int, default=500, help="Report on training set and log every that many instances (500)")
    parser.add_argument("--repeverybatches", type=int, default=None, help="Report on training set and log every that many batches (None)")
    parser.add_argument("--batchsize", type=int, default=32, help="Batch size")
    parser.add_argument("--maxepochs", type=int, default=50, help="Maximum number of epochs")
    parser.add_argument("--stopfile", type=str, help="If that file exists, training is stopped")
    parser.add_argument("--module", type=str, help="The class/file name to use for the pytorch module (within modelzoo)")
    parser.add_argument("--wrapper", type=str, help="The class/file name to use as the model wrapper")
    parser.add_argument("--learningrate", type=float, help="Override default learning rate for the optimizer")
    parser.add_argument("--ngram_layer", type=str, default="cnn", help="Architecture to use for ngrams: lstm or cnn (cnn)")
    parser.add_argument("--es_patience", type=int, default=2, help="Early stopping patience iterations (2)")
    parser.add_argument("--es_metric", type=str, default="loss", help="Which metric to use for early stopping, 'loss' or 'accuracy' (loss)")
    parser.add_argument("--elmo", type=str, default=None, help="Use elmo model for embedding, specify path to elmo model")
    parser.add_argument("--cuda", type=utils.str2bool, help="True/False to use CUDA or not, omit to determine automatically")
    parser.add_argument("--seed", type=int, default=0, help="Random seed to make experiments repeatable/explore randomness (default 0=random random seed)")
    parser.add_argument("--noshuffle", action="store_true", help="Prevent shuffling of the dataset (False)")
    # NOTE: resume currently does not make sure that the original metafile info is used (but maybe new data):
    # This should work once the metadata is actually stored as part of the model!
    parser.add_argument("--resume", action='store_true', help="Resume training from the specified model")
    parser.add_argument("--notrain", action='store_true', help="Do not actually run training, but show generated model")
    parser.add_argument("--nocreate", action='store_true', help="Do not actually even create module (do nothing)")
    parser.add_argument("--valfile", type=str, default=None, help="Use this file for validation")
    parser.add_argument("--version", action='version', version=gatelfpytorchjson.__version__)
    parser.add_argument("--debug", action='store_true', help="Set logger to DEBUG and show more information")
    parser.add_argument("metafile", help="Path to metafile (REQUIRED)")
    parser.add_argument("modelname", help="Model path prefix (full path and beginning of model file name) (REQUIRED)")

    args = parser.parse_args(args=sysargs[1:])

    if args.es_metric not in ["loss", "accuracy"]:
        raise Exception("es_metric must be loss or accuracy")

    metafile = args.metafile
    modelname = args.modelname

    if not metafile or not modelname:
        raise Exception("Metafile or modelfile not specified, use --help parameter for help\nLearningFramework defaults are: crvd.meta.json FileJsonPyTorch.model")

    datadir = str(Path(metafile).parent)

    config = vars(args)

    if config.get("debug"):
        logger.setLevel(logging.DEBUG)

    es_patience = config.get("es_patience")
    es_mindelta = 0.0
    def es_lambda(losses=None, accs=None, patience=None, mindelta=None, metric="loss"):
        return ModelWrapper.\
            early_stopping_checker(losses, accs, patience=es_patience, mindelta=es_mindelta, metric=config["es_metric"])
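    # A hedged sketch of the contract behind early_stopping_checker (the real
    # implementation in ModelWrapper is not shown in this snippet): stop once
    # the chosen metric has not improved by more than mindelta for the last
    # `patience` evaluations, e.g.:
    #
    #   def checker_sketch(losses, accs, patience=2, mindelta=0.0, metric="loss"):
    #       vals = losses if metric == "loss" else [-a for a in accs]
    #       if len(vals) <= patience:
    #           return False  # too little history to decide
    #       best = min(vals[:-patience])
    #       return all(v >= best - mindelta for v in vals[-patience:])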


    # Also log to a file
    filehandler = logging.FileHandler(os.path.join(datadir, "pytorch-json.train.log"))
    filehandler.setFormatter(formatter)
    logger.addHandler(filehandler)

    # in order to override the logging level of any of the modules/classes used,
    # get the logger and do it here
    # logger1 = logging.getLogger("gatelfpytorchjson.modelwrapperdefault")
    # logger1.setLevel(logging.DEBUG)


    # TODO: use a static Dataset method to parse the remaining args and create an args
    # dict to pass to the constructors of Dataset and the wrapper so each can pick the
    # parameters relevant to them!

    logger.debug("Running train.py, config is %r" % config)

    if config.get("nocreate"):
        logger.info("--nocreate specified, exiting")
        sys.exit(0)

    logger.debug("Loading metafile...")
    ds = Dataset(metafile, config=config)
    logger.debug("Metafile loaded.")

    # determine and use the correct modelwrapper
    # default is ModelWrapperDefault
    wrapper_class = ModelWrapperDefault
    if config.get("wrapper"):
        wrapperclassname = config["wrapper"]
        import importlib
        module = importlib.import_module("gatelfpytorchjson." + wrapperclassname)
        wrapper_class = getattr(module, wrapperclassname)

    # TODO: test passing on parameters
    if config.get("resume"):
        logger.info("--resume specified, loading and continuing on existing model")
        wrapper = wrapper_class.load(modelname, metafile=metafile)
        logger.debug("Modelwrapper loaded")
        logger.debug("Model is %r" % wrapper)
    else:
        logger.debug("Creating ModelWrapperSimple")
        wrapper = wrapper_class(ds, config=config)
        logger.debug("Modelwrapper created")
        logger.debug("Model is %r" % wrapper)

    if config.get("notrain"):
        logger.info("--notrain specified, exiting")
        sys.exit(0)

    if config.get("debug"):
        glf = getattr(wrapper, "get_logger", None)
        if glf and callable(glf):
            wlogger = wrapper.get_logger()
            logger.debug("Setting wrapper logging level to DEBUG")
            wlogger.setLevel(logging.DEBUG)
        else:
            logger.debug("Wrapper has not logging, cannot set to DEBUG")

    # TODO: the default to use for validation set size should be settable through config in the constructor!
    logger.debug("Preparing the data...")
    # if we have a validation file, use it, ignore the valsize
    if config.get("valfile"):
        wrapper.prepare_data(file=config["valfile"])
    else:
        valsize = config.get("valsize")
        if valsize is not None:
            wrapper.prepare_data(validationsize=valsize)
        else:
            wrapper.prepare_data()
    logger.debug("Data prepared")

    wrapper.validate_every_batches = config["valeverybatches"]
    wrapper.validate_every_epochs = config["valeveryepochs"]
    wrapper.validate_every_instances = config["valeveryinstances"]
    wrapper.report_every_instances = config["repeveryinstances"]
    wrapper.report_every_batches = config["repeverybatches"]

    # TODO: figure out what good defaults are here and what we want to set here rather than
    # in the constructor. Maybe allow to set everything in the constructor for simplicity?
    logger.info("Model: %r" % wrapper)
    logger.debug("Start training...")
    wrapper.train(batch_size=config["batchsize"],
                  early_stopping=es_lambda, max_epochs=config["maxepochs"],
                  filenameprefix=modelname)
    logger.debug("Training completed")

    # NOTE: this will save the modelwrapper, and will ONLY save the model if we did not already
    # save the best model during training!
    logger.debug("Saving model...")
    wrapper.save(modelname)
    logger.debug("Model saved")

    # print the model used again so we do not have to scroll back a huge log ...
    logger.info("Model: %r" % wrapper)
Example 5
from __future__ import print_function
from gatelfdata import Dataset
import sys

if len(sys.argv) != 2:
    raise Exception("Need one parameter: meta file")

file = sys.argv[1]

ds = Dataset(file)

valset = ds.convert_to_file()
for b in ds.batches_converted(batch_size=20, as_numpy=False, pad_left=True):
    print("Batch: len=", len(b))
    print("Batch: data=", b)
Example 6
import os
import sys
import logging
import psutil
from memory_profiler import memory_usage
from gatelfdata import Dataset

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler = logging.StreamHandler()
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
# Code to figure out the best way to load pre-calculated embeddings fast and efficiently.
# We need to load embeddings in a way where only those embeddings are loaded which
# also occur in our own vocabulary.

process = psutil.Process(os.getpid())
mem_rss0, mem_vms0 = process.memory_info()[0:2]
memusage0 = memory_usage(-1, interval=1, timeout=1)[0]
print("Before loading dataset: Memory RSS/VMS=%s/%s // %s" % (mem_rss0, mem_vms0, memusage0))
print("System memory", psutil.virtual_memory())

ds = Dataset(TESTFILE3)
mem_rss1, mem_vms1 = process.memory_info()[0:2]
memusage1 = memory_usage(-1, interval=1, timeout=1)[0]
print("After loading dataset: Memory RSS/VMS // Mem=%s/%s // %s" % (mem_rss1, mem_vms1, memusage1))
print("After loading dataset: diffs  RSS/VMS // Mem=%s/%s // %s" % (mem_rss1-mem_rss0, mem_vms1-mem_vms0, memusage1-memusage0))
print("Dataset vocabs: ", ds.vocabs)
vtoken = ds.vocabs.get_vocab("token")
print("Token vocab: ", vtoken)

# test our simple approach to loading embeddings. We expect a link or copy of glove.6B.50d.txt.gz in the tests/data
# directory for this
# emb_file = "tests/data/glove.6B.50d.txt.gz"
# emb_file = "tests/data/wiki.en.vec"
emb_file = sys.argv[1]

stoi = vtoken.stoi
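
A minimal sketch of the filtered loading idea described above, assuming a gzipped GloVe-style text file (one token followed by its float dimensions per line, as for glove.6B.50d.txt.gz); the function name load_filtered_embeddings is an assumption:

import gzip
import numpy as np

def load_filtered_embeddings(filename, stoi):
    """Keep only the vectors whose token also occurs in our vocabulary."""
    embs = {}
    with gzip.open(filename, "rt", encoding="utf-8") as infile:
        for line in infile:
            fields = line.rstrip().split(" ")
            if fields[0] in stoi:
                embs[fields[0]] = np.array(fields[1:], dtype=np.float32)
    return embs

embs = load_filtered_embeddings(emb_file, stoi)
print("Loaded %s of %s vocabulary embeddings" % (len(embs), len(stoi)))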
Example 7
import sys
import os
import json
import logging
from gatelfdata import Dataset

# Set up logging
logger = logging.getLogger("gatelfdata")
logger.setLevel(logging.ERROR)
logger = logging.getLogger("gatelfkerasjson")
logger.setLevel(logging.DEBUG)
streamhandler = logging.StreamHandler(stream=sys.stderr)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
filehandler = logging.FileHandler(
    os.path.join(datadir, "FileJsonKerasWrapper.train.log"))
logger.addHandler(filehandler)

# restore the wrapper
ds = Dataset(metafile, targets_need_padding=False)
wrapper = KerasWrapperImpl1(ds)
wrapper.loadModel(modelprefix)

with sys.stdin as infile:
    for line in infile:
        #! print("PYTHON FileJsonKeras APPLICATION, input=",line,file=sys.stderr)
        if line == "STOP":
            break
        # TODO: currently the LF sends individual instances here, we may want to change
        # However we need to always apply to a set of instances, so wrap into another array
        instancedata = json.loads(line)
        # TODO: better error handling: put the apply call into a try block and catch any error, also
        # check returned data. If there is a problem send back in the map we return!!
        # NOTE: the  LF expects to get a map with the following elements:
        # status: must be "ok", anything else is interpreted as an error
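        # The loop body is truncated here. A hedged sketch of what could
        # follow, based only on the comments above (the apply() call, the
        # "output" field and the error format are assumptions, not confirmed
        # by this snippet):
        #
        #   try:
        #       preds = wrapper.apply([instancedata])
        #       response = {"status": "ok", "output": preds}
        #   except Exception as err:
        #       response = {"status": "error: %s" % err}
        #   print(json.dumps(response))
        #   sys.stdout.flush()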
Example 8
from __future__ import print_function
from gatelfdata import Dataset
import sys

if len(sys.argv) != 2:
    raise Exception("Need one parameter: meta file")

file = sys.argv[1]

ds = Dataset(file)
valset = ds.convert_to_file()
for instance in ds.instances_as_data():
    print("Instance: ", instance)
Example 9
from __future__ import print_function
from gatelfdata import Dataset
import sys
import json

if len(sys.argv) != 2:
    raise Exception("Need one parameter: meta file")

file = sys.argv[1]

ds = Dataset(file)

valset = ds.convert_to_file()
it = iter(ds.instances_as_string())
for n in range(20):
    b = []
    print("BATCH: ", n)
    for i in range(2):
        print("INSTANCE: ", i)
        instance = next(it)
        print("Instance: ", instance)
        converted = ds.convert_instance(json.loads(instance))
        print("Converted: ", converted)
        b.append(converted)
    batch1 = ds.reshape_batch(b)
    print("Size2 batch: ", batch1)
    print()
Example 10
def main(sysargs):

    logger.debug("Called with args=%s" % (sysargs,))
    myconfig = ModelWrapperDefault.configsimple(topconfig)

    myconfig.parse_args(args=sysargs[1:])

    if myconfig.get("es_metric") not in ["loss", "accuracy"]:
        raise Exception("es_metric must be loss or accuracy")

    metafile = myconfig.get("metafile")
    modelname = myconfig.get("modelname")

    if not metafile or not modelname:
        raise Exception("Metafile or modelfile not specified, use --help parameter for help\nLearningFramework defaults are: crvd.meta.json FileJsonPyTorch.model")

    datadir = str(Path(metafile).parent)

    if myconfig.get("debug"):
        logger.setLevel(logging.DEBUG)

    es_patience = myconfig.get("es_patience")
    es_mindelta = 0.0

    def es_lambda(losses=None, accs=None, patience=None, mindelta=None, metric="loss"):
        return ModelWrapper.\
            early_stopping_checker(losses, accs, patience=es_patience, mindelta=es_mindelta,
                                   metric=myconfig["es_metric"])

    # Also log to a file
    filehandler = logging.FileHandler(os.path.join(datadir, "pytorch-json.train.log"))
    filehandler.setFormatter(formatter)
    logger.addHandler(filehandler)

    # in order to override the logging level of any of the modules/classes used,
    # get the logger and do it here
    # logger1 = logging.getLogger("gatelfpytorchjson.modelwrapperdefault")
    # logger1.setLevel(logging.DEBUG)

    # TODO: use a static Dataset method to parse the remaining args and create an args
    # dict to pass to the constructors of Dataset and the wrapper so each can pick the
    # parameters relevant to them!

    logger.debug("Running train.py, config is %r" % myconfig)

    if myconfig.get("nocreate"):
        logger.info("--nocreate specified, exiting")
        sys.exit(0)

    logger.debug("Loading metafile...")
    ds = Dataset(metafile, config=myconfig)
    logger.debug("Metafile loaded.")

    # determine and use the correct modelwrapper
    # default is ModelWrapperDefault
    wrapper_class = ModelWrapperDefault
    if myconfig.get("wrapper"):
        wrapperclassname = myconfig["wrapper"]
        import importlib
        module = importlib.import_module("gatelfpytorchjson." + wrapperclassname)
        wrapper_class = getattr(module, wrapperclassname)

    # TODO: test passing on parameters
    if myconfig.get("resume"):
        logger.info("--resume specified, loading and continuing on existing model")
        wrapper = wrapper_class.load(modelname, metafile=metafile)
        logger.debug("Modelwrapper loaded")
        logger.debug("Model is %r" % wrapper)
    else:
        logger.debug("Creating ModelWrapperSimple")
        wrapper = wrapper_class(ds, config=myconfig)
        logger.debug("Modelwrapper created")
        logger.debug("Model is %r" % wrapper)

    if myconfig.get("notrain"):
        logger.info("--notrain specified, exiting")
        sys.exit(0)

    if myconfig.get("debug"):
        glf = getattr(wrapper, "get_logger", None)
        if glf and callable(glf):
            wlogger = wrapper.get_logger()
            logger.debug("Setting wrapper logging level to DEBUG")
            wlogger.setLevel(logging.DEBUG)
        else:
            logger.debug("Wrapper has not logging, cannot set to DEBUG")

    # TODO: the default to use for validation set size should be settable through config in the constructor!
    logger.debug("Preparing the data...")
    # if we have a validation file, use it, ignore the valsize
    if myconfig.get("valfile"):
        wrapper.prepare_data(file=myconfig["valfile"])
    else:
        valsize = myconfig.get("valsize")
        if valsize is not None:
            wrapper.prepare_data(validationsize=valsize)
        else:
            wrapper.prepare_data()
    logger.debug("Data prepared")

    wrapper.validate_every_batches = myconfig["valeverybatches"]
    wrapper.validate_every_epochs = myconfig["valeveryepochs"]
    wrapper.validate_every_instances = myconfig["valeveryinstances"]
    wrapper.report_every_instances = myconfig["repeveryinstances"]
    wrapper.report_every_batches = myconfig["repeverybatches"]

    # TODO: figure out what good defaults are here and what we want to set here rather than
    # in the constructor. Maybe allow to set everything in the constructor for simplicity?
    logger.info("Model: %r" % wrapper)
    logger.debug("Start training...")
    wrapper.train(batch_size=myconfig["batchsize"],
                  early_stopping=es_lambda, max_epochs=myconfig["maxepochs"],
                  filenameprefix=modelname)
    logger.debug("Training completed")

    # NOTE: this will save the modelwrapper, and will ONLY save the model if we did not already
    # save the best model during training!
    logger.debug("Saving model...")
    wrapper.save(modelname)
    logger.debug("Model saved")

    # print the model used again so we do not have to scroll back a huge log ...
    logger.info("Model: %r" % wrapper)