def train(self, task_id, training_set, testing_set=None, **kwargs):
    """Train one dlib HOG+SVM detector per object and publish the result.

    The method downloads the annotated images for ``training_set``, writes a
    train/test XML pair for every object, runs dlib training per object,
    zips the whole output folder, uploads the archive through the storage
    manager and records it in the database.

    Args:
        task_id: Accepted for interface compatibility; not read by this
            method.
        training_set: Passed to ``MyUtils.downloadImgWithBx`` and stored in
            the classifier metadata.
        testing_set: Only stored in the classifier metadata.
        **kwargs: ``id`` (optional) names the output folder; when it is
            absent or None a timestamp is used instead.

    Returns:
        A status string containing the metadata record that was stored.
    """
    print('vm trainer called!')

    # connect to database manager
    self.db_manager = MongoDBManager(HogSvmTrainer.dbname, ip_addr='128.2.213.107')
    self.storage_manager = StorageManager('test1')
    local_tmp_dir = os.path.abspath(HogSvmTrainer.local_tmp_dir)

    # download images to cache and get all bounding_boxes in memory
    obj_image_bounding_box = MyUtils.downloadImgWithBx(
        local_tmp_dir, self.db_manager, self.storage_manager, training_set)

    # create output dir structure: one output folder for each task
    output_dir_prefix = os.path.abspath('./classifier/hog_svm_')
    requested_id = kwargs.get('id')
    if requested_id is not None:
        # Append the *value* of the 'id' keyword (not the literal string
        # 'id') so that every task gets its own folder and archive.
        output_dir = output_dir_prefix + str(requested_id)
    else:
        timestamp = time.strftime("%m_%d_%y_%H_%M_%S")
        output_dir = output_dir_prefix + timestamp
    MyUtils.create_dir(output_dir)

    # generate vm specific input files
    # for each object, create a folder for train.txt, test.txt, and final classifier svm
    obj_output_dirs = {}
    for each_obj, obj_info in obj_image_bounding_box.items():
        # split the training data to obtain a separate testing file
        # NOTE(review): the meaning of the constant 10 is defined by
        # split_train_file, which is not visible here -- confirm.
        train_obj_info, test_obj_info = self.split_train_file(obj_info, 10)
        cur_obj_dir = output_dir + '/' + each_obj
        obj_output_dirs[each_obj] = cur_obj_dir
        MyUtils.create_dir(cur_obj_dir)
        self.genInputFileForObj(
            each_obj, train_obj_info, cur_obj_dir + '/' + 'train_' + each_obj + '.xml')
        self.genInputFileForObj(
            each_obj, test_obj_info, cur_obj_dir + '/' + 'test_' + each_obj + '.xml')

    # train one detector per object
    for each_obj, each_obj_dir in obj_output_dirs.items():
        print('calling dlib to train at folder {}'.format(each_obj_dir))
        detector_name = each_obj + '.svm'
        detector_path = each_obj_dir + '/' + detector_name
        accuracy = self.train_with_dlib(
            each_obj_dir, detector_path,
            'train_' + each_obj + '.xml', 'test_' + each_obj + '.xml')
        # The metadata below only says 'see log', so put every object's
        # accuracy into the log instead of silently discarding it.
        print("Testing accuracy: {}".format(accuracy))

    # save the classifier into s3 and database
    MyUtils.zipdir(output_dir)
    zip_file = output_dir + '.zip'
    print('uploading file to s3...')
    key = self.storage_manager.uploadFile(zip_file)

    print('updating database...')
    # TODO: maybe send this information to controller?
    # add meta data info to database
    classifier_metadata = {
        'name': key,
        'training_set': training_set,
        'testing_set': testing_set,
        'accuracy': 'see log',
        'filepath': key,
    }
    self.db_manager.addClassifier(classifier_metadata)
    return 'succesfully added ' + str(classifier_metadata)
def train(self, task_id, training_set, testing_set, **kwargs):
    """Fine-tune the reference CaffeNet on cropped object images.

    The method caches the images of every video in ``training_set``, crops
    each annotated bounding box into its own file, writes the Caffe
    train/test/label lists, customises the reference prototxt files, runs
    ``caffe train`` through the shell, then zips the output folder, uploads
    it through the storage manager and records it in the database.

    Args:
        task_id: Used as the output folder suffix; when None a timestamp is
            used instead.
        training_set: Mapping of object name to an iterable of video ids.
        testing_set: Only stored in the classifier metadata.
        **kwargs: ``iteration`` (optional) overrides the default of 500
            solver iterations.

    Returns:
        None.
    """
    print('caffe train called')
    self.output_dir_prefix = os.path.abspath('./classifier/caffenet_')
    # create an output folder for each task
    if task_id is not None:
        self.output_dir = self.output_dir_prefix + str(task_id)
    else:
        timestamp = time.strftime("%m_%d_%y_%H_%M_%S")
        self.output_dir = self.output_dir_prefix + timestamp

    # TODO: changed to user input parameter?
    max_iter = kwargs.get('iteration', 500)
    MyUtils.create_dir(self.output_dir)

    # connect to database manager
    self.db_manager = MongoDBManager(CaffeNetTrainer.dbname, ip_addr='128.2.213.107')
    self.storage_manager = StorageManager('test1')

    # make intermediate folder
    self.ori_image_dir = CaffeNetTrainer.local_tmp_dir + '/' + 'original'
    self.crop_image_dir = CaffeNetTrainer.local_tmp_dir + '/' + 'crop'
    self.cache_manager = cacheManager(self.ori_image_dir)
    MyUtils.create_dir(self.ori_image_dir)
    MyUtils.create_dir(self.crop_image_dir)

    # get image file list and bounding boxes
    # get objects regardless of their videos, since the same object in
    # different videos belongs to the same identity
    # summarize to get all objects
    obj_set = {}
    for obj, vid in training_set.items():
        obj_set.setdefault(obj, []).extend(vid)
    print(obj_set)

    # for each object, get images
    training_set_obj_file_path = {}
    # videos whose images were already cached during this call, so a video
    # that is listed several times is only fetched once
    # (assumes video ids are hashable, e.g. strings -- TODO confirm)
    cached_vids = set()
    for obj, vids in obj_set.items():
        # 1. download image
        # 2. crop image based on bounding boxes
        obj_image_path = []
        obj_bounding_boxes_with_image = {}
        for vid in vids:
            obj_bounding_boxes_each_vid = \
                self.db_manager.getBoundingBoxWithImageByVidAndObj(vid, obj)
            if vid not in cached_vids:
                self.cache_manager.cacheImageFiles(vid, dir=False)
                cached_vids.add(vid)
            obj_bounding_boxes_with_image.update(obj_bounding_boxes_each_vid)

        for image, bx_list in obj_bounding_boxes_with_image.items():
            if not bx_list:
                # nothing to crop; do not touch the image file at all
                continue
            name_root, name_ext = os.path.splitext(image)
            # open the source image once and reuse it for every box
            im = Image.open(self.ori_image_dir + '/' + image)
            for idx, bx in enumerate(bx_list):
                # bx is used as (x, y, width, height)
                left_x = bx[0]
                left_y = bx[1]
                right_x = left_x + bx[2]
                right_y = left_y + bx[3]
                output_file_path = (self.crop_image_dir + '/' + name_root
                                    + '_' + str(obj) + str(idx) + name_ext)
                im.crop((left_x, left_y, right_x, right_y)).save(output_file_path)
                obj_image_path.append(output_file_path)
        training_set_obj_file_path[obj] = obj_image_path

    # generate training file
    # NOTE(review): the meaning of the constant 10 is defined by
    # split_train_file, which is not visible here -- confirm.
    training_set_obj_file_path, testing_set_obj_file_path = \
        self.split_train_file(training_set_obj_file_path, 10)
    self.generate_caffe_train_file(training_set_obj_file_path,
                                   self.output_dir + '/train.txt')
    self.generate_caffe_train_file(testing_set_obj_file_path,
                                   self.output_dir + '/test.txt')
    # generate label file that maps labels to object names
    self.generate_caffe_label_file(training_set_obj_file_path,
                                   self.output_dir + '/label.txt')

    # modify network prototxt
    num_output_category = len(obj_set)
    train_file_path = os.path.abspath(self.output_dir + '/train.txt')
    test_file_path = os.path.abspath(self.output_dir + '/test.txt')
    output_train_val_path = self.output_dir + '/' + 'train_val_custom.prototxt'
    output_solver_path = self.output_dir + '/' + 'solver_custom.prototxt'
    output_deploy_path = self.output_dir + '/' + 'deploy_custom.prototxt'

    # fine-tuning output
    # (these attributes are not read again in this method; presumably the
    # mod_caffe_* helpers use them -- verify before removing)
    self.custom_net_path = self.output_dir + '/train_val_custom.prototxt'
    self.custom_snapshot_prefix = self.output_dir + '/caffenet_custom'
    self.output_layer_name = "fc8_custom"
    self.output_net_name = "CaffeNetCustom"

    # original model
    self.original_output_layer_name = 'fc8'

    # reference design file locations
    input_file_prefix = os.path.abspath('./bvlc_reference_caffenet')
    train_file = 'train_val.prototxt'
    solver_file = 'solver.prototxt'
    deploy_file = 'deploy.prototxt'

    self.mod_caffe_net(input_file_prefix + '/' + train_file, num_output_category,
                       train_file_path, test_file_path, output_train_val_path)
    self.mod_caffe_solver(input_file_prefix + '/' + solver_file, max_iter,
                          output_solver_path)
    self.mod_caffe_deploy(input_file_prefix + '/' + deploy_file,
                          num_output_category, output_deploy_path)

    cmd = "/opt/caffe/build/tools/caffe train"
    cmd += " -solver " + output_solver_path
    cmd += " -weights ./bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel"
    cmd += " | tee " + self.output_dir + '/' + 'log.txt'
    print(cmd)
    # issue train
    # NOTE(review): the command is a shell string built from task_id-derived
    # paths and the exit status of os.system is ignored, so a failed training
    # run is still zipped and uploaded -- consider checking the result.
    os.system(cmd)

    MyUtils.zipdir(self.output_dir)
    zip_file = self.output_dir + '.zip'
    print('uploading file to s3...')
    key = self.storage_manager.uploadFile(zip_file)

    print('updating database...')
    # TODO: maybe send this information to controller?
    # add meta data info to database
    classifier_metadata = {
        'name': key,
        'training_set': training_set,
        'testing_set': testing_set,
        'accuracy': 'see log',
        'filepath': key,
    }
    self.db_manager.addClassifier(classifier_metadata)
    return