def distributed_train(model, train_method, nodes, fts_method, data, num_batches=10,
                      train_parameters=None, **kwargs):
    import dispy, dispy.httpd, datetime

    # avoid a mutable default argument
    if train_parameters is None:
        train_parameters = {}

    batch_save = kwargs.get('batch_save', False)  # save model between batches
    batch_save_interval = kwargs.get('batch_save_interval', 1)
    file_path = kwargs.get('file_path', None)

    cluster, http_server = start_dispy_cluster(train_method, nodes)

    print("[{0: %H:%M:%S}] Distributed Train Started with {1} CPUs"
          .format(datetime.datetime.now(), get_number_of_cpus(cluster)))

    jobs = []
    n = len(data)
    batch_size = int(n / num_batches)
    bcount = 1
    for ct in range(model.order, n, batch_size):
        if model.is_multivariate:
            ndata = data.iloc[ct - model.order:ct + batch_size]
        else:
            ndata = data[ct - model.order:ct + batch_size]

        tmp_model = fts_method()
        tmp_model.clone_parameters(model)

        job = cluster.submit(tmp_model, ndata, train_parameters)
        job.id = bcount  # associate an ID to identify jobs (if needed later)
        jobs.append(job)
        bcount += 1

    for job in jobs:
        print("[{0: %H:%M:%S}] Processing batch ".format(datetime.datetime.now()) + str(job.id))
        tmp = job()
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            model.merge(tmp)

            if batch_save and (job.id % batch_save_interval) == 0:
                Util.persist_obj(model, file_path)
        else:
            print(job.exception)
            print(job.stdout)

        print("[{0: %H:%M:%S}] Finished batch ".format(datetime.datetime.now()) + str(job.id))

    print("[{0: %H:%M:%S}] Distributed Train Finished".format(datetime.datetime.now()))

    stop_dispy_cluster(cluster, http_server)

    return model
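# --- Hedged usage sketch for distributed_train(); not taken from the source. ---
# Assumes dispy workers are running on the listed node address (a placeholder),
# and uses simple_model_train from pyFTS.distributed.dispy as the worker-side
# train function, as fit() does below; the synthetic series is illustrative only.
import numpy as np
from pyFTS.distributed import dispy as fts_dispy
from pyFTS.models import hofts
from pyFTS.partitioners import Grid

train_data = np.random.normal(0, 1, 1000).cumsum()   # synthetic placeholder series

model = hofts.HighOrderFTS(partitioner=Grid.GridPartitioner(data=train_data, npart=20),
                           order=2)

trained = distributed_train(model,
                            fts_dispy.simple_model_train,  # worker-side train function
                            ['192.168.0.100'],             # dispy node addresses (placeholder)
                            hofts.HighOrderFTS,            # class used to clone per-batch models
                            train_data,
                            num_batches=10,
                            batch_save=True, batch_save_interval=2,
                            file_path='models/hofts_dist.pkl')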
def train_individual_model(partitioner, train_data, indexer):
    pttr = str(partitioner.__module__).split('.')[-1]
    diff = "_diff" if partitioner.transformation is not None else ""
    _key = "msfts_" + pttr + str(partitioner.partitions) + diff + "_" + indexer.name

    print(_key)

    model = cmsfts.ContextualMultiSeasonalFTS(_key, indexer=indexer)
    model.append_transformation(partitioner.transformation)
    model.train(train_data, partitioner.sets, order=1)

    cUtil.persist_obj(model, "models/" + _key + ".pkl")

    return model
def train(self, data, **kwargs):
    self.original_max = max(self.indexer.get_data(data))
    self.original_min = min(self.indexer.get_data(data))

    num_cores = multiprocessing.cpu_count()

    # enumerate every (indexer, partitioner) combination to train in parallel
    pool = {}
    count = 0
    for ix in self.indexers:
        for pt in self.partitioners:
            pool[count] = {'ix': ix, 'pt': pt}
            count += 1

    results = Parallel(n_jobs=num_cores)(
        delayed(train_individual_model)(deepcopy(pool[m]['pt']), data, deepcopy(pool[m]['ix']))
        for m in pool.keys())

    for tmp in results:
        self.append_model(tmp)

    cUtil.persist_obj(self, "models/" + self.name + ".pkl")
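# --- Minimal, self-contained sketch of the joblib fan-out pattern used above. ---
# fit_one and configs are illustrative names, not part of pyFTS; the deepcopy
# mirrors the call above, where each worker gets its own copy of the config.
import multiprocessing
from copy import deepcopy
from joblib import Parallel, delayed

def fit_one(config, data):
    # stand-in for train_individual_model: train one model per configuration
    return {'config': config, 'n': len(data)}

configs = [{'partitions': p} for p in (10, 20, 30)]
data = list(range(100))

results = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(fit_one)(deepcopy(cfg), data) for cfg in configs)

print(len(results))  # one result per configuration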
def fit(self, ndata, **kwargs):
    """
    Fit the model's parameters based on the training data.

    :param ndata: training time series data
    :param kwargs:

    :keyword num_batches: split the training data in num_batches to save memory during the training process
    :keyword save_model: save final model on disk
    :keyword batch_save: save the model between each batch
    :keyword file_path: path to save the model
    :keyword distributed: boolean, indicate if the training procedure will be distributed in a dispy cluster
    :keyword nodes: a list with the dispy cluster nodes addresses
    """
    import datetime

    if self.is_multivariate:
        data = ndata
    else:
        data = self.apply_transformations(ndata)

    self.original_min = np.nanmin(data)
    self.original_max = np.nanmax(data)

    if 'partitioner' in kwargs:
        self.partitioner = kwargs.pop('partitioner')

    if not self.is_multivariate and not self.is_wrapper and not self.benchmark_only:
        if self.partitioner is None:
            raise Exception("Fuzzy sets were not provided for the model. Use the 'partitioner' parameter.")

    if 'order' in kwargs:
        self.order = kwargs.pop('order')

    dump = kwargs.get('dump', None)
    num_batches = kwargs.get('num_batches', 10)
    save = kwargs.get('save_model', False)  # save model on disk
    batch_save = kwargs.get('batch_save', False)  # save model between batches
    file_path = kwargs.get('file_path', None)
    distributed = kwargs.get('distributed', False)
    batch_save_interval = kwargs.get('batch_save_interval', 10)

    if distributed:
        if distributed == 'dispy':
            from pyFTS.distributed import dispy
            nodes = kwargs.get('nodes', False)
            train_method = kwargs.get('train_method', dispy.simple_model_train)
            dispy.distributed_train(self, train_method, nodes, type(self), data, num_batches, {},
                                    batch_save=batch_save, file_path=file_path,
                                    batch_save_interval=batch_save_interval)
        elif distributed == 'spark':
            from pyFTS.distributed import spark
            url = kwargs.get('url', 'spark://192.168.0.110:7077')
            app = kwargs.get('app', 'pyFTS')
            spark.distributed_train(self, data, url=url, app=app)
    else:
        if dump == 'time':
            print("[{0: %H:%M:%S}] Start training".format(datetime.datetime.now()))

        if num_batches is not None and not self.is_wrapper:
            n = len(data)
            batch_size = int(n / num_batches)
            bcount = 1

            rng = range(self.order, n, batch_size)

            if dump == 'tqdm':
                from tqdm import tqdm
                rng = tqdm(rng)

            for ct in rng:
                if dump == 'time':
                    print("[{0: %H:%M:%S}] Starting batch ".format(datetime.datetime.now()) + str(bcount))

                if self.is_multivariate:
                    mdata = data.iloc[ct - self.order:ct + batch_size]
                else:
                    mdata = data[ct - self.order:ct + batch_size]

                self.train(mdata, **kwargs)

                if batch_save:
                    Util.persist_obj(self, file_path)

                if dump == 'time':
                    print("[{0: %H:%M:%S}] Finish batch ".format(datetime.datetime.now()) + str(bcount))

                bcount += 1
        else:
            self.train(data, **kwargs)

        if dump == 'time':
            print("[{0: %H:%M:%S}] Finish training".format(datetime.datetime.now()))

    if save:
        Util.persist_obj(self, file_path)
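# --- Hedged examples of the fit() keywords documented above; not from the source. ---
# The model construction and the node/master addresses are placeholders; the
# keyword names match the branches handled in fit() above.
import numpy as np
from pyFTS.partitioners import Grid
from pyFTS.models import hofts

train_data = np.random.normal(0, 1, 1000).cumsum()   # synthetic placeholder series
model = hofts.HighOrderFTS(partitioner=Grid.GridPartitioner(data=train_data, npart=20),
                           order=2)

# batched, in-memory training with timestamped progress dumps
model.fit(train_data, num_batches=10, dump='time',
          batch_save=True, file_path='models/checkpoint.pkl')

# distributed training via dispy, dispatched by the branch above
# model.fit(train_data, distributed='dispy', nodes=['192.168.0.100'])

# distributed training via spark, using the default master URL above
# model.fit(train_data, distributed='spark', url='spark://192.168.0.110:7077', app='pyFTS')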
model.fit(train, method='GD', alpha=0.5, momentum=None, iteractions=1)

'''
model.fit(train, method='GA',
          ngen=15,             # number of generations
          mgen=7,              # stop after mgen generations without improvement
          npop=15,             # number of individuals in the population
          pcruz=.5,            # crossover percentage of the population
          pmut=.3,             # mutation percentage of the population
          window_size=7000,
          train_rate=.8,
          increment_rate=.2,
          experiments=1
          )
'''

Util.persist_obj(model, 'fcm_fts10c')

'''
model = Util.load_obj('fcm_fts05c')
'''

#forecasts = model.predict(test)

#print(model)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5])

ax.plot(test, label='Original')

forecasts = model.predict(test)

# pad the first 'order' positions so the forecasts align with the test series
for w in np.arange(model.order):
    forecasts.insert(0, None)
sonda_treino = sonda[:1051200]
sonda_teste = sonda[1051200:]   # test split starts where the training slice ends

#res = bchmk.simpleSearch_RMSE(sonda_treino, sonda_teste,
#                              sfts.SeasonalFTS, np.arange(3, 30), [1], parameters=1440,
#                              tam=[15, 8], plotforecasts=False, elev=45, azim=40,
#                              save=False, file="pictures/sonda_sfts_error_surface", intervals=False)

from pyFTS.common import Util
from pyFTS.models import cmsfts

partitions = ['grid', 'entropy']
indexers = ['m15', 'Mh', 'Mhm15']

for max_part in [40, 50]:
    for part in partitions:
        fs = Util.load_obj("models/sonda_fs_" + part + "_" + str(max_part) + ".pkl")
        for ind in indexers:
            ix = Util.load_obj("models/sonda_ix_" + ind + ".pkl")

            model = cmsfts.ContextualMultiSeasonalFTS(part + " " + ind, ix)
            model.train(sonda_treino, fs)

            Util.persist_obj(model, "models/sonda_cmsfts_" + part + "_"
                             + str(max_part) + "_" + ind + ".pkl")
def fit(self, ndata, **kwargs):
    """
    Fit the model's parameters based on the training data.

    :param ndata: the training time series
    :param kwargs:

    :keyword num_batches: split the training data in num_batches to save memory during the training process
    :keyword save_model: save final model on disk
    :keyword batch_save: save the model between each batch
    :keyword file_path: path to save the model
    :keyword distributed: boolean, indicate if the training procedure will be distributed in a dispy cluster
    :keyword nodes: a list with the dispy cluster nodes addresses
    :return:
    """
    import datetime

    if self.is_multivariate:
        data = ndata
    else:
        data = self.apply_transformations(ndata)

    self.original_min = np.nanmin(data)
    self.original_max = np.nanmax(data)

    if 'sets' in kwargs:
        self.sets = kwargs.pop('sets')

    if 'partitioner' in kwargs:
        self.partitioner = kwargs.pop('partitioner')

    if (self.sets is None or len(self.sets) == 0) and not self.benchmark_only:
        if self.partitioner is not None:
            self.sets = self.partitioner.sets
        else:
            raise Exception("Fuzzy sets were not provided for the model. Use the 'sets' or 'partitioner' parameters.")

    if 'order' in kwargs:
        self.order = kwargs.pop('order')

    dump = kwargs.get('dump', None)
    num_batches = kwargs.get('num_batches', None)
    save = kwargs.get('save_model', False)  # save model on disk
    batch_save = kwargs.get('batch_save', False)  # save model between batches
    file_path = kwargs.get('file_path', None)
    distributed = kwargs.get('distributed', False)
    batch_save_interval = kwargs.get('batch_save_interval', 10)

    if distributed:
        nodes = kwargs.get('nodes', False)
        train_method = kwargs.get('train_method', Util.simple_model_train)
        Util.distributed_train(self, train_method, nodes, type(self), data, num_batches, {},
                               batch_save=batch_save, file_path=file_path,
                               batch_save_interval=batch_save_interval)
    else:
        if dump == 'time':
            print("[{0: %H:%M:%S}] Start training".format(datetime.datetime.now()))

        if num_batches is not None:
            n = len(data)
            batch_size = int(n / num_batches)
            bcount = 1

            rng = range(self.order, n, batch_size)

            if dump == 'tqdm':
                from tqdm import tqdm
                rng = tqdm(rng)

            for ct in rng:
                if dump == 'time':
                    print("[{0: %H:%M:%S}] Starting batch ".format(datetime.datetime.now()) + str(bcount))

                if self.is_multivariate:
                    mdata = data.iloc[ct - self.order:ct + batch_size]
                else:
                    mdata = data[ct - self.order:ct + batch_size]

                self.train(mdata, **kwargs)

                if batch_save:
                    Util.persist_obj(self, file_path)

                if dump == 'time':
                    print("[{0: %H:%M:%S}] Finish batch ".format(datetime.datetime.now()) + str(bcount))

                bcount += 1
        else:
            self.train(data, **kwargs)

        if dump == 'time':
            print("[{0: %H:%M:%S}] Finish training".format(datetime.datetime.now()))

    if save:
        Util.persist_obj(self, file_path)
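# --- Hedged sketch for this older fit() variant; not from the source. ---
# Unlike the newer version, it also accepts fuzzy sets directly via the 'sets'
# keyword handled above; the positional GridPartitioner call mirrors the
# script below, and the series is a synthetic placeholder.
import numpy as np
from pyFTS.partitioners import Grid
from pyFTS.models import hofts

train_data = np.random.normal(0, 1, 1000).cumsum()   # synthetic placeholder series
fs = Grid.GridPartitioner(train_data, 20)

model = hofts.HighOrderFTS("")
model.fit(train_data, sets=fs.sets, order=2, num_batches=10, dump='time')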
order = 3
nparts = 20

fuzzysets = []
fuzzysets.append(Grid.GridPartitioner(fln_train.glo_avg, nparts))
fuzzysets.append(Grid.GridPartitioner(joi_train.glo_avg, nparts))
fuzzysets.append(Grid.GridPartitioner(sbr_train.glo_avg, nparts))

d = {'fln_glo_avg': fln_train.glo_avg,
     'sbr_glo_avg': sbr_train.glo_avg,
     'joi_glo_avg': joi_train.glo_avg}
data_train = pd.DataFrame(d)
data_train = data_train.dropna(axis=0, how='any')

model_file = "models/fts/multivariate/mvhofts-" + str(order) + "-" + str(nparts) + ".pkl"

# keep the instance name distinct from the mvhofts module to avoid shadowing it
model = mvhofts.MultivariateHighOrderFTS("")
model.train(data_train, fuzzysets, order)

cUtil.persist_obj(model, model_file)

obj = cUtil.load_obj(model_file)

dt = {'fln_glo_avg': fln_test.glo_avg,
      'sbr_glo_avg': sbr_test.glo_avg,
      'joi_glo_avg': joi_test.glo_avg}
data_test = pd.DataFrame(dt)
data_test = data_test.dropna(axis=0, how='any')

ret = obj.forecast(data_test)

# drop the first 'order' observations and the last forecast so both series align
print("RMSE: " + str(Measures.rmse(list(data_test.fln_glo_avg[order:]), ret[:-1])))

#print(model)