# Constructor: stores the experiment script location, the message-broker connection
# settings (including credentials), and the learning components (sync mechanism,
# aggregator, factories, stopping criterion); a fixed IntervalDataScheduler is used
# and a unique id is derived from the process id.
def __init__(self, expFileName, sysPath, messengerHost, messengerPort, messengerUser,
             messengerPassword, sync, aggregator, learnerFactory, dataSourceFactory,
             stoppingCriterion, initHandler=InitializationHandler()):
    self._uniqueId = str(os.getpid())
    self.expFileName = expFileName
    self.sysPath = sysPath
    self.messengerHost = messengerHost
    self.messengerPort = messengerPort
    self.messengerUser = messengerUser
    self.messengerPassword = messengerPassword
    self.sync = sync
    self.aggregator = aggregator
    self.learnerFactory = learnerFactory
    self.dataSourceFactory = dataSourceFactory
    self.dataScheduler = IntervalDataScheduler()
    self.initHandler = initHandler
    self.stoppingCriterion = stoppingCriterion
# Constructor: in 'cpu' mode no GPU devices are used; otherwise the visible GPUs are
# discovered (from CUDA_VISIBLE_DEVICES if set, else by counting the devices reported
# by `nvidia-smi -L`) and the number of models per GPU is computed from the requested
# number of nodes.
def __init__(self, executionMode, messengerHost, messengerPort, numberOfNodes, sync,
             aggregator, learnerFactory, dataSourceFactory, stoppingCriterion,
             initHandler=InitializationHandler(), dataScheduler=IntervalDataScheduler,
             minStartNodes=0, minStopNodes=0, sleepTime=5):
    self.executionMode = executionMode
    if executionMode == 'cpu':
        self.devices = None
        self.modelsPer = None
    else:
        self.devices = []
        if os.environ.get('CUDA_VISIBLE_DEVICES') is None:
            # one entry per GPU listed by nvidia-smi ("GPU 0: ... (UUID: ...)")
            gpuIds = range(str(subprocess.check_output(["nvidia-smi", "-L"])).count('UUID'))
        else:
            gpuIds = os.environ.get('CUDA_VISIBLE_DEVICES').split(',')
        for taskid in gpuIds:
            self.devices.append('cuda:' + str(taskid))
        # spread the learners evenly over the visible GPUs
        self.modelsPer = math.ceil(numberOfNodes * 1.0 / len(self.devices))
        print(self.modelsPer, "models per gpu on", ','.join(self.devices))
    self.messengerHost = messengerHost
    self.messengerPort = messengerPort
    self.numberOfNodes = numberOfNodes
    self.sync = sync
    self.aggregator = aggregator
    self.learnerFactory = learnerFactory
    self.dataSourceFactory = dataSourceFactory
    self.stoppingCriterion = stoppingCriterion
    self.initHandler = initHandler
    self.dataScheduler = dataScheduler
    self._uniqueId = str(os.getpid())
    self.sleepTime = sleepTime
    self.minStartNodes = minStartNodes
    self.minStopNodes = minStopNodes
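How the constructor's `devices` list and `modelsPer` value are later used to place the individual learners on GPUs is not shown in this excerpt. The following is a minimal illustrative sketch assuming a simple block assignment; the helper `assign_device` and the concrete numbers are hypothetical and not taken from the code above.

import math

def assign_device(nodeId, devices, modelsPer):
    # Hypothetical block assignment: with 8 nodes on 2 GPUs (modelsPer = 4),
    # nodes 0-3 run on 'cuda:0' and nodes 4-7 on 'cuda:1'.
    if devices is None:  # corresponds to executionMode == 'cpu'
        return 'cpu'
    return devices[nodeId // modelsPer]

devices = ['cuda:0', 'cuda:1']
modelsPer = math.ceil(8 / len(devices))
print([assign_device(n, devices, modelsPer) for n in range(8)])
# ['cuda:0', 'cuda:0', 'cuda:0', 'cuda:0', 'cuda:1', 'cuda:1', 'cuda:1', 'cuda:1']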
# Experiment setup for MNIST with a dropout CNN: models are combined by averaging,
# training stops once 2800 examples have been processed (MaxAmountExamples), and the
# MNIST text file is split across the nodes round-robin. Variables such as
# executionMode, messengerHost, messengerPort, numberOfNodes, sync, updateRule,
# learningRate, learningParams, lossFunction, batchSize, syncPeriod and the
# DropoutNet network are assumed to be defined earlier in the experiment script.
aggregator = Average()
stoppingCriterion = MaxAmountExamples(2800)
dsFactory = FileDataSourceFactory(filename="../../../../data/textualMNIST/mnist_train.txt",
                                  decoder=MNISTDecoder(),
                                  numberOfNodes=numberOfNodes,
                                  indices='roundRobin',
                                  shuffle=False,
                                  cache=False)
learnerFactory = PytorchLearnerFactory(network=DropoutNet(),
                                       updateRule=updateRule,
                                       learningRate=learningRate,
                                       learningParams=learningParams,
                                       lossFunction=lossFunction,
                                       batchSize=batchSize,
                                       syncPeriod=syncPeriod)
initHandler = InitializationHandler()
exp = Experiment(executionMode=executionMode,
                 messengerHost=messengerHost,
                 messengerPort=messengerPort,
                 numberOfNodes=numberOfNodes,
                 sync=sync,
                 aggregator=aggregator,
                 learnerFactory=learnerFactory,
                 dataSourceFactory=dsFactory,
                 stoppingCriterion=stoppingCriterion,
                 initHandler=initHandler)
exp.run("MNISTtorchCNNwithDropOut")