# service creation (unsupervised model)
parameters_mllib = {}
parameters_output = {}
dd.put_service(sname, model, description, mllib,
               parameters_input, parameters_mllib, parameters_output, 'unsupervised')

# training
train_data = [training_repo]
parameters_input = {'id': '', 'separator': ',', 'label': 'label'}
parameters_mllib = {'iterations': 500}
parameters_output = {}
predout = dd.post_train(sname, train_data, parameters_input,
                        parameters_mllib, parameters_output, async=True)

# poll the training job until it completes
time.sleep(1)
train_status = ''
while True:
    train_status = dd.get_train(sname, job=1, timeout=3)
    if train_status['head']['status'] == 'running':
        print train_status['body']['measure']
    else:
        print train_status
        predout = train_status
        break

# gather the 2-D coordinates returned for every training sample
predictions = predout['body']['predictions']
N = len(predictions)
points = np.empty((N, 2), dtype=np.float)
i = 0
for p in predictions:
    points[i, 0] = p['vals'][0]
    points[i, 1] = p['vals'][1]
    i = i + 1
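# The `points` array built above holds one 2-D coordinate per training sample,
# so it can be plotted directly. The lines below are a minimal sketch, assuming
# matplotlib is installed; they are not part of the original script.
import matplotlib.pyplot as plt

plt.scatter(points[:, 0], points[:, 1], s=2)
plt.title('2-D output of the unsupervised model')
plt.show()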
# Start training the service
iterations = int(service['iterations'])
solver_type = service['solver_type']
base_lr = float(service['base_lr'])
parameters_input_training = {'shuffle': True, 'test_split': test_split,
                             'min_count': min_count, 'min_word_length': min_word_length,
                             'count': False}
parameters_mllib_training = {'gpu': True,
                             'solver': {'iterations': iterations,
                                        'test_interval': test_interval,
                                        'base_lr': base_lr,
                                        'solver_type': solver_type},
                             'net': {'batch_size': batch_size}}
parameters_output_training = {'measure': ['mcll', 'f1', 'cmdiag', 'cmfull']}
train_data = [root_repository + 'dataset/']
training_service = dd.post_train(service_name.lower(), train_data,
                                 parameters_input_training, parameters_mllib_training,
                                 parameters_output_training, async=True)
job_number = training_service['head']['job']

# Poll the training measures while the job is running
sleep(20)
status_code = 200
count_job_data = 1
while status_code == 200:
    job_data = dd.get_train(service_name.lower(), job=job_number, measure_hist=True)
    status_code = job_data['status']['code']
    if 'accp' not in job_data['body']['measure']:
        # no measures available yet, wait and try again
        sleep(20)
        continue
    if job_data['head']['status'] == 'running':
        log_file.write("job running time " + str(job_data['head']['time']) + "\n")
        log_file.write("Iteration number " + str(job_data['body']['measure']['iteration']) + "\n")
        log_file.flush()
        running_time = job_data['head']['time']
        accp = job_data['body']['measure']['accp']
        recall = job_data['body']['measure']['recall']
        iteration = job_data['body']['measure']['iteration']
        precision = job_data['body']['measure']['precision']
        mcll = job_data['body']['measure']['mcll']
        f1 = job_data['body']['measure']['f1']
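        # Follow-up sketch (not in the original snippet): append the captured
        # measures to a CSV row so the training curve can be plotted later.
        # The 'training_measures.csv' filename is an assumption.
        with open('training_measures.csv', 'a') as measures_csv:
            row = [iteration, running_time, accp, precision, recall, f1, mcll]
            measures_csv.write(','.join(str(v) for v in row) + '\n')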
class ModelTrainer:
    """
    Prediction Model trainer class
    binary char-based model training class
    """

    def __init__(self, structure, logger, config):
        """
        Instantiate a model trainer

        :param dic structure: Model Trainer specific settings, e.g.:
            {"model-repo":"../models/mymodel","training-repo":"../training/mytraining",
             "sname":"MyTrainer","test_split":0.01,"base_lr":0.01,"clevel":False,
             "sequence":140,"iterations":50000,"test_interval":1000,"stepsize":15000,
             "destroy":True,"resume":False,"finetune":False,"weights":"","nclasses":2,
             "documents":True,"batch_size":128,"test_batch_size":16,"gpu":False,"gpuid":0,
             "mllib":"xgboost","lregression":False}

            *model-repo* location of the model
            *training-repo* location of the training files
            *sname* service name
            *test_split* training split, between 0 and 1 (exclusive), type=float, default=0.01
            *base_lr* initial learning rate, type=float, default=0.01
            *clevel* character-level convolutional net, type=boolean
            *sequence* sequence length for character-level models, type=int, default=140
            *iterations* number of iterations, type=int, default=50000
            *test_interval* test interval, type=int, default=1000
            *stepsize* lr policy stepsize, type=int, default=15000
            *destroy* whether to destroy the model, type=boolean
            *resume* whether to resume training, type=boolean
            *finetune* whether to finetune, type=boolean
            *weights* pre-trained weights file, used when finetuning
            *nclasses* number of classes, type=int, default=2
            *documents* whether to train from text documents (as opposed to sentences in one doc), type=boolean
            *batch_size* batch size, type=int, default=128
            *test_batch_size* test batch size, type=int, default=16
            *gpu* enable gpu usage if True, default=False
            *gpuid* specify gpu id, type=int, default=0
            *mllib* caffe or xgboost, default='caffe'
            *lregression* whether to use logistic regression, type=boolean
        :param obj logger: DFM logger object
        :param obj config: DFM global config object
        :returns: ModelTrainer object (instance of a ModelTrainer class)
        """
        self.config = config
        self.structure = structure
        self.logger = logger
        self.nclasses = self.structure['nclasses']
        self.description = 'classifier'
        self.sname = self.structure['sname']
        self.mllib = self.structure['mllib']
        self.sequence = self.structure['sequence']
        self.dd = DD(config['DEEP_DETECT_URI'], config['DEEP_DETECT_PORT'])
        self.dd.set_return_format(self.dd.RETURN_PYTHON)

    def createMLTrainerService(self):
        """
        Create the ML Trainer service in DeepDetect
        """
        if self.structure['lregression']:
            self.template = 'lregression'
        else:
            self.template = 'mlp'
            layers = [800, 500, 200]
        if self.structure['clevel']:
            self.template = 'convnet'
            self.layers = ['1CR256', '1CR256', '4CR256', '1024', '1024']
        self.model = {'templates': '../templates/caffe/', 'repository': self.structure['model-repo']}
        self.parameters_input = {'connector': 'txt', 'sentences': False,
                                 'characters': self.structure['clevel'], 'read_forward': True}
        if self.structure['documents']:
            self.parameters_input['sentences'] = False
        if self.structure['clevel']:
            self.parameters_input['sequence'] = self.sequence
            #parameters_input['alphabet'] = 'abcdef0123456789' # hex
            #parameters_input['alphabet'] = '_-,:?/.(){}*%0123456789abcdefghijklmnopqrstuvwxyz' # opcode
            #parameters_input['alphabet'] = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?'\"/\\|_@#$%^&*~`+-=<>"
        self.parameters_mllib = {'template': self.template, 'nclasses': self.nclasses,
                                 'db': True, 'dropout': 0.5}
        if self.mllib == 'xgboost':
            self.parameters_mllib['db'] = False
        if not self.template == 'lregression':
            self.parameters_mllib['layers'] = layers
        #parameters_mllib = {'nclasses':nclasses,'db':True}
        if self.structure['finetune']:
            self.parameters_mllib['finetuning'] = True
            if not self.structure['weights']:
                # the server will fail on service creation anyway
                self.logger.error('Finetuning requires a weights file')
            else:
                self.parameters_mllib['weights'] = self.structure['weights']
        self.parameters_output = {}
        self.logger.debug("dd.put_service(" + str(self.structure['sname']) + "," + str(self.model) + ","
                          + str(self.description) + "," + str(self.mllib) + "," + str(self.parameters_input) + ","
                          + str(self.parameters_mllib) + "," + str(self.parameters_output) + ")")
        return self.dd.put_service(self.structure['sname'], self.model, self.description, self.mllib,
                                   self.parameters_input, self.parameters_mllib, self.parameters_output)

    def trainModel(self):
        """
        Train the model
        """
        self.train_data = [self.structure['training-repo']]
        self.parameters_input = {'test_split': self.structure['test_split'], 'shuffle': True, 'db': True}
        if not self.structure['clevel']:
            self.parameters_input['min_word_length'] = 5
            self.parameters_input['min_count'] = 10
            self.parameters_input['count'] = False
            if self.mllib == 'xgboost':
                self.parameters_input['tfidf'] = True
                self.parameters_input['db'] = False
        else:
            self.parameters_input['sentences'] = True
            self.parameters_input['characters'] = True
            self.parameters_input['sequence'] = self.sequence
        if self.structure['documents']:
            self.parameters_input['sentences'] = False
        if self.mllib == 'caffe':
            self.parameters_input['db'] = True
            self.parameters_mllib = {
                'gpu': self.structure['gpu'],
                'gpuid': self.structure['gpuid'],
                'resume': self.structure['resume'],
                'net': {
                    'batch_size': self.structure['batch_size']
                },
                'solver': {
                    'test_interval': self.structure['test_interval'],
                    'test_initialization': False,
                    'base_lr': self.structure['base_lr'],
                    'solver_type': 'ADAM',
                    'iterations': self.structure['iterations']
                }
            }  # optionally: 'lr_policy':'step','stepsize':self.structure['stepsize'],'gamma':0.5,'weight_decay':0.0001
        elif self.mllib == 'xgboost':
            self.parameters_mllib = {
                'iterations': self.structure['iterations'],
                'objective': 'multi:softprob',
                'booster_params': {'max_depth': 50}
            }
        self.parameters_output = {'measure': ['mcll', 'f1', 'cmdiag', 'cmfull']}
        if self.nclasses == 2:
            self.parameters_output['measure'].append('auc')
        self.logger.debug("dd.post_train(" + self.structure['sname'] + "," + str(self.train_data) + ","
                          + str(self.parameters_input) + "," + str(self.parameters_mllib) + ","
                          + str(self.parameters_output) + ",async=" + str(True) + ")")
        self.dd.post_train(self.structure['sname'], self.train_data, self.parameters_input,
                           self.parameters_mllib, self.parameters_output, async=True)
        time.sleep(1)
        train_status = ''
        while True:
            train_status = self.dd.get_train(self.sname, job=1, timeout=10)
            if train_status['head']['status'] == 'running':
                self.logger.debug(train_status['body']['measure'])
            else:
                self.logger.debug(train_status)
                break
        return train_status

    def clearMLTrainerService(self, clear=''):
        """
        Delete the service, keeping the model

        :param str clear: use clear='lib' to clear the model as well, default empty
        :returns: DeepDetect delete result
        """
        return self.dd.delete_service(self.sname, clear=clear)
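# Minimal usage sketch for the ModelTrainer class above (not part of the original
# module). The `structure` keys follow what the methods actually read; the Python
# logging logger and the plain config dict stand in for the DFM logger and config
# objects, and the DeepDetect host/port values are assumptions.
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('modeltrainer')
config = {'DEEP_DETECT_URI': 'localhost', 'DEEP_DETECT_PORT': 8080}
structure = {'model-repo': '../models/mymodel', 'training-repo': '../training/mytraining',
             'sname': 'MyTrainer', 'test_split': 0.01, 'base_lr': 0.01, 'clevel': False,
             'sequence': 140, 'iterations': 50000, 'test_interval': 1000, 'stepsize': 15000,
             'destroy': True, 'resume': False, 'finetune': False, 'weights': '', 'nclasses': 2,
             'documents': True, 'batch_size': 128, 'test_batch_size': 16, 'gpu': False,
             'gpuid': 0, 'mllib': 'caffe', 'lregression': False}

trainer = ModelTrainer(structure, logger, config)
trainer.createMLTrainerService()
train_status = trainer.trainModel()
trainer.clearMLTrainerService()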