class Controller(object):

    def __init__(self):
        # vm registries
        self.label_vm   = []
        self.feature_vm = []
        self.classify_vm = []
        self.logger         = MyUtils.getLogger('controller')
        
        #file manager
        self.data_manager = StorageManager('test1')
        self.db_manager = DBManager()
        
        #engines

        # rpc server for now for handling client requests
        rpc_paths = ('/RPC2',)  # note: unused unless set on a request handler class
        # create server
        self.server = SimpleXMLRPCServer(("0.0.0.0", 8888))
        self.server.register_introspection_functions()
        self.server.register_function(self.addImage, 'addImage')
        self.server.serve_forever()


    def decode_image_from_string(self, s):
        # interpret the raw rpc payload as an array of bytes
        return np.fromstring(s, np.uint8)
        
    def addImage(self, image_name, image, override=False):
        # save image to disk first, and then upload to s3
        self.logger.debug('received data. override? {}'.format(override))

        # make sure the local staging directory exists before writing
        MyUtils.create_dir('./tmp')
        full_path = './tmp/' + image_name
        with open(full_path, 'wb') as f:
            f.write(self.decode_image_from_string(image.data))

        added = False

        if (override or not self.db_manager.hasImage(image_name)):
            self.data_manager.uploadFile(image_name,'./tmp', image_name)
            # file path, is just its image name in s3
            added = self.db_manager.addImage(image_name, override)

#        os.remove(full_path)        
        return added
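
# Illustrative client-side sketch (not part of the original module): how a
# client might push an image to the controller over XML-RPC.  The image file
# name here is a hypothetical example; the host/port match the address used
# by cacheManager below.
def _demo_add_image_client():
    import xmlrpclib
    server = xmlrpclib.ServerProxy("http://128.2.213.107:8888", allow_none=True)
    with open('example.jpg', 'rb') as f:
        # wrap the raw bytes in Binary so xmlrpclib base64-encodes them;
        # addImage() reads them back through image.data
        payload = xmlrpclib.Binary(f.read())
    print 'image added?', server.addImage('example.jpg', payload, False)
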
class cacheManager(object):
    def __init__(self, cache_dir=None):
        if cache_dir is None:
            self.cache_folder = os.path.expanduser("~/.oda/cache")
        else:
            self.cache_folder = os.path.expanduser(cache_dir)
        self.server = xmlrpclib.ServerProxy("http://128.2.213.107:8888", allow_none=True)
        self.data_manager = StorageManager("test1")
        create_dir(self.cache_folder)

    # `dir` controls whether frames are cached inside a per-video subfolder
    def cacheImageFiles(self, video_name, frame_indices=None, dir=True):
        img_cache_list, frm_range_string = self.server.getImageListByVid(video_name, frame_indices)
        if dir:
            vid_cache_dir = self.cache_folder + "/" + video_name
        else:
            vid_cache_dir = self.cache_folder
        create_dir(vid_cache_dir)
        # download image one by one
        map(self.downloadFileIfNotExist, img_cache_list, [vid_cache_dir] * len(img_cache_list))
        return vid_cache_dir, img_cache_list, frm_range_string

    def cacheImageFromFileList(self, img_cache_list, vid_cache_dir):
        create_dir(vid_cache_dir)
        # download image one by one
        map(self.downloadFileIfNotExist, img_cache_list, [vid_cache_dir] * len(img_cache_list))
        return vid_cache_dir, img_cache_list

    def downloadFileIfNotExist(self, file_name, to_directory):
        if self.dirHasFile(to_directory, file_name):
            return False
        else:
            return self.data_manager.download(file_name, to_directory)

    def dirHasFile(self, directory, file_name):
        directory = os.path.expanduser(directory)
        file_path = os.path.join(directory, file_name)
        return os.path.isfile(file_path)
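
# Minimal usage sketch (illustrative, not part of the original code): cache
# all frames of a video locally; a second run hits the cache and skips the
# downloads.  'video1' is a hypothetical video name.
def _demo_cache_video_frames():
    cm = cacheManager()  # defaults to ~/.oda/cache
    vid_dir, frames, frm_range = cm.cacheImageFiles('video1', dir=True)
    print 'cached {} frames into {}'.format(len(frames), vid_dir)
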
class HogSvmTrainer(object):
    local_tmp_dir = './tmp/s3'    
    dbname = 'test1'
    
    def create_dir(self, dir_name):
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

    def download_files(self, img_file_list):
        local_tmp_dir = HogSvmTrainer.local_tmp_dir
        self.create_dir(local_tmp_dir)
        local_file_list = []
        for each_remote_file in img_file_list:
            self.storage_manager.download(each_remote_file, local_tmp_dir)
            local_file_list.append(local_tmp_dir+'/'+ each_remote_file)
        return local_file_list

    def upload_file(self, file_name, local_dir, key):
        return self.storage_manager.uploadFile(file_name, local_dir, key)
    
    def save_xml(self, fname, img_file_name_list, bounding_box_list):
        # create XML
        # training data
        root = etree.Element('dataset')
        images = etree.SubElement(root, "images")
        for (idx, file_name) in enumerate(img_file_name_list):
            image = etree.SubElement(images, "image")
            image_attr = image.attrib
            image_attr['file']= file_name
            box = etree.SubElement(image, "box")
            box_attr = box.attrib
            cur_bounding_box = bounding_box_list[idx]
            # note: unlike genxmlForFile below, this method expects boxes
            # as [top, left, width, height]
            box_attr['top']    = str(cur_bounding_box[0])
            box_attr['left']   = str(cur_bounding_box[1])
            box_attr['width']  = str(cur_bounding_box[2])
            box_attr['height'] = str(cur_bounding_box[3])

        print 'writing xml file: ' + fname
        with open(fname,'w+') as outfile:
            outfile.write(etree.tostring(root, pretty_print=True))

    # create an xml entry for an image file with multiple bounding boxes
    # inputs: filename - the image file; bx - a list of bounding boxes
    def genxmlForFile(self, etree_parent, filename, bx):
        image = etree.SubElement(etree_parent, "image")
        image_attr = image.attrib
        image_attr['file']= filename
        for each_bx in bx:
            box = etree.SubElement(image, "box")
            box_attr = box.attrib
            # note: in each bounding box the first element is left,
            # the second is top
            box_attr['top']    = str(each_bx[1])
            box_attr['left']   = str(each_bx[0])
            box_attr['width']  = str(each_bx[2])
            box_attr['height'] = str(each_bx[3])
        return etree_parent

    # hold out every `ratio`-th image as a test sample; objects with fewer
    # than two images keep everything in the training split
    def split_train_file(self, obj_info, ratio):
        train_obj_info = {}
        test_obj_info = {}
        if len(obj_info) < 2:
            train_obj_info = obj_info
        else:
            idx = 0
            for file_name, bx in obj_info.iteritems():
                if (0 != idx % ratio):
                    train_obj_info[file_name] =bx
                else:
                    test_obj_info[file_name] =bx
                idx +=1
        return train_obj_info, test_obj_info
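
    # Illustrative sketch (a hypothetical helper, not used elsewhere): with
    # ratio=10, split_train_file sends indices 0, 10, ... to the test split,
    # i.e. roughly 1 in 10 images is held out.
    def _demo_split_train_file(self):
        obj_info = dict(('frame_%02d.jpg' % i, [[0, 0, 64, 64]]) for i in range(20))
        train_info, test_info = self.split_train_file(obj_info, 10)
        # expect 18 train / 2 test (dict iteration order decides which files)
        print len(train_info), len(test_info)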

    # generate input training file for a particular object
    # obj_info accept a format of :
    # {file1:[bx1, bx2], file2:[bx1, bx2]}
    def genInputFileForObj(self, obj_name, obj_info, output_xml_file_name):
        # create XML
        # training data
        root = etree.Element('dataset')
        images = etree.SubElement(root, "images")
        for filename, bx in obj_info.iteritems():
            images = self.genxmlForFile(images, filename, bx)

        print 'generated xml file: ' + output_xml_file_name
        with open(output_xml_file_name,'w+') as outfile:
            outfile.write(etree.tostring(root, pretty_print=True))
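
    # For reference, genInputFileForObj emits dlib's imglab dataset format,
    # e.g. (abridged, with made-up values):
    #
    #   <dataset>
    #     <images>
    #       <image file='frame_00.jpg'>
    #         <box top='12' left='30' width='64' height='64'/>
    #       </image>
    #     </images>
    #   </dataset>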

    # download all images needed from server
    # create output directories for different objects
    # launch multiple threads running dlib for different objects
    # zip all final svm files into a zip file
    def train(self, task_id, training_set, testing_set=None, **kwargs):
        print 'vm trainer called!'
        
        # connect to database manager
        self.db_manager = MongoDBManager(HogSvmTrainer.dbname, ip_addr='128.2.213.107')
        self.storage_manager = StorageManager('test1')

        local_tmp_dir = os.path.abspath(HogSvmTrainer.local_tmp_dir)

        # download images to cache and get all bounding_boxes in memory:
        obj_image_bounding_box = \
            MyUtils.downloadImgWithBx(local_tmp_dir, self.db_manager, self.storage_manager, training_set)

        # create output dir structure
        output_dir_prefix = os.path.abspath('./classifier/hog_svm_')
        # create an output folder for each task
        if 'id' in kwargs:
            output_dir = output_dir_prefix + str(kwargs['id'])
        else:
            timestamp = time.strftime("%m_%d_%y_%H_%M_%S")
            output_dir = output_dir_prefix + timestamp
        MyUtils.create_dir(output_dir)

        # generate vm specific input files
        # for each object, create a folder for train.txt, test.txt, and final classifier svm
        obj_output_dirs = {}
        for each_obj, obj_info in obj_image_bounding_box.iteritems():
            # hold out part of the training data to generate a testing file
            train_obj_info, test_obj_info = self.split_train_file(obj_info, 10)
            cur_obj_dir = output_dir + '/' + each_obj
            obj_output_dirs[each_obj] = cur_obj_dir
            MyUtils.create_dir(cur_obj_dir)
            self.genInputFileForObj(each_obj, train_obj_info, cur_obj_dir + '/train_' + each_obj + '.xml')
            self.genInputFileForObj(each_obj, test_obj_info, cur_obj_dir + '/test_' + each_obj + '.xml')

        for each_obj, each_obj_dir in obj_output_dirs.iteritems():
            print 'calling dlib to train at folder {}'.format(each_obj_dir)
            detector_name = each_obj + '.svm'
            detector_path = each_obj_dir + '/' + detector_name
            accuracy = self.train_with_dlib(each_obj_dir, detector_path, 'train_' + each_obj + '.xml', 'test_' + each_obj + '.xml')
            print 'testing accuracy for {}: {}'.format(each_obj, accuracy)

        # save the classifiers into s3 and the database
        MyUtils.zipdir(output_dir)
        zip_file = output_dir + '.zip'
        print 'uploading file to s3...'
        key = self.storage_manager.uploadFile(zip_file)
        print 'updating database...'

        # TODO: maybe send this information to controller?
        # add meta data info to database
        classifier_metadata={'name': key,
                             'training_set':training_set,
                             'testing_set':testing_set,
                             'accuracy': 'see log',
                             'filepath': key}
        self.db_manager.addClassifier(classifier_metadata)
        return 'successfully added ' + str(classifier_metadata)



    # input: a path to an object folder containing the training and testing
    # xml files named by train_file_name and test_file_name
    def train_with_dlib(self, obj_folder, detector_path, train_file_name, test_file_name):
        # log to stdout and to a per-object run log
        logger = MyUtils.Tee("{0}/{1}.log".format(obj_folder, 'run'), 'w')
        logger.write('dlib training called')
        
        # Now let's do the training.  The train_simple_object_detector() function
        # has a bunch of options, all of which come with reasonable default
        # values.  The next few lines go over some of these options.
        options = dlib.simple_object_detector_training_options()

        # The trainer is a kind of support vector machine and therefore has the usual
        # SVM C parameter.  In general, a bigger C encourages it to fit the training
        # data better but might lead to overfitting.  You must find the best C value
        # empirically by checking how well the trained detector works on a test set of
        # images you haven't trained on.  Don't just leave the value set at 5.  Try a
        # few different C values and see what works best for your data.
        options.C = 5

        # Tell the code how many CPU cores your computer has for the fastest training.
        options.num_threads = 2
        options.be_verbose = True

        training_xml_path = os.path.join(obj_folder, train_file_name)
        testing_xml_path = os.path.join(obj_folder, test_file_name)
        # This function does the actual training.  It will save the final
        # detector to detector_path.  The input is an XML file that lists the
        # images in the training dataset and the positions of the object
        # bounding boxes (the files produced by genInputFileForObj above).  To
        # create such XML files by hand you can use dlib's imglab tool, a
        # simple graphical tool for labeling objects in images with boxes.
        logger.write('start training. saved detector path: ' + detector_path)
        dlib.train_simple_object_detector(training_xml_path, detector_path, options)
        logger.write( 'end training')


        # Now that we have a detector we can test it.  The first statement tests
        # it on the training data.  It will log the precision, recall, and
        # average precision.
        logger.write("")  # blank line to create a gap from previous output
        logger.write("Training accuracy: {}".format(
            dlib.test_simple_object_detector(training_xml_path, detector_path)))
        # However, to get an idea whether it really worked without overfitting
        # we need to run it on images it wasn't trained on; the next line does
        # this with the held-out testing set.
        accuracy = dlib.test_simple_object_detector(testing_xml_path, detector_path)
        logger.write("Testing accuracy: {}".format(accuracy))
        logger.flush()
        return accuracy
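
# Standalone usage sketch (illustrative only): run the dlib trainer against a
# prepared object folder.  The folder layout and xml file names follow what
# train() creates above; 'mug' is a hypothetical object label.
def _demo_train_with_dlib():
    trainer = HogSvmTrainer()
    obj_dir = './classifier/hog_svm_example/mug'  # assumed to already exist
    detector_path = obj_dir + '/mug.svm'
    accuracy = trainer.train_with_dlib(obj_dir, detector_path,
                                       'train_mug.xml', 'test_mug.xml')
    print 'held-out accuracy:', accuracy
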
class CaffeNetTrainer(object):
    local_tmp_dir = os.path.abspath('./tmp/s3')
    dbname = 'test1'

    def mod_caffe_net(self, ori_net_file_path, num_output_category, train_file_path, test_file_path,
                      output_file_path):
        net_param = caffe_pb2.NetParameter()
        with open(ori_net_file_path, 'r') as pt_fd:
            Merge(pt_fd.read(), net_param)

        net_param.name = self.output_net_name
        for layer in net_param.layer:
            #    print layer.name
            # modify data layer
            if "data" == layer.name:
                print 'found layer data'
                print layer.type
                layer.type = 'ImageData'
                print layer.image_data_param
                layer.image_data_param.new_height = 256
                layer.image_data_param.new_width = 256
                layer.image_data_param.batch_size = 20

                layer.ClearField("data_param")

                # change to user input
                if (layer.include[0].phase == caffe_pb2.TRAIN):
                    layer.image_data_param.source = train_file_path
                elif (layer.include[0].phase == caffe_pb2.TEST):
                    layer.image_data_param.source = test_file_path

            if self.original_output_layer_name == layer.name:
                print 'found output layer ' + layer.name
                layer.name = self.output_layer_name

                # rename the layer's top blob to match the new layer name
                for idx, top in enumerate(layer.top):
                    if self.original_output_layer_name == top:
                        layer.top[idx] = self.output_layer_name

                # boost the learning rate of the replaced layer so fine-tuning
                # mostly updates the new output weights
                for param in layer.param:
                    param.lr_mult = param.lr_mult * 10
                layer.inner_product_param.num_output = num_output_category

            if "accuracy" == layer.name:
                print 'modifying layer loss'
                for bottom in layer.bottom:
                    idx =0
                    if self.original_output_layer_name == bottom:
                        layer.bottom[idx]= self.output_layer_name
                    idx += 1

            if "loss" == layer.name:
                print 'modifying layer loss'
                for bottom in layer.bottom:
                    idx =0
                    if self.original_output_layer_name == bottom:
                        layer.bottom[idx]= self.output_layer_name
                    idx += 1

        net_string = MessageToString(net_param)
        with open(output_file_path, 'w') as output:
            output.write(net_string)
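
    # After mod_caffe_net runs, the data layers in the written prototxt look
    # roughly like this (the values are the ones hard-coded above):
    #
    #   layer {
    #     name: "data"
    #     type: "ImageData"
    #     image_data_param {
    #       source: "<output_dir>/train.txt"   # test.txt for the TEST phase
    #       batch_size: 20
    #       new_height: 256
    #       new_width: 256
    #     }
    #     ...
    #   }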


    def mod_caffe_solver(self, ori_solver_file_path, num_max_iteration, output_file_path):
        solver_param = caffe_pb2.SolverParameter()
        with open(ori_solver_file_path, 'r') as pt_fd:
            Merge(pt_fd.read(), solver_param)
        print solver_param

        solver_param.net = self.custom_net_path
        solver_param.test_iter[0] = 100
        solver_param.test_interval = 100
        solver_param.base_lr = 0.001
        solver_param.stepsize = 400
        solver_param.max_iter = num_max_iteration
        solver_param.snapshot = 100
        solver_param.snapshot_prefix = self.custom_snapshot_prefix
        solver_param.solver_mode = solver_param.CPU

        solver_string = MessageToString(solver_param)
        with open(output_file_path, 'w') as output:
            output.write(solver_string)
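
    # The rewritten solver carries the fields set above, roughly:
    #
    #   net: "<output_dir>/train_val_custom.prototxt"
    #   test_iter: 100
    #   test_interval: 100
    #   base_lr: 0.001
    #   stepsize: 400
    #   max_iter: <num_max_iteration>
    #   snapshot: 100
    #   snapshot_prefix: "<output_dir>/caffenet_custom"
    #   solver_mode: CPU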

    def mod_caffe_deploy(self, ori_deploy_file_path, num_output_category, output_file_path):
        net_param = caffe_pb2.NetParameter()
        with open(ori_deploy_file_path, 'r') as pt_fd:
            Merge(pt_fd.read(), net_param)

        net_param.name = self.output_net_name
        for layer in net_param.layer:
            if self.original_output_layer_name == layer.name:
                print 'modifying layer ' + layer.name
                layer.name = self.output_layer_name

                for idx, top in enumerate(layer.top):
                    if self.original_output_layer_name == top:
                        layer.top[idx] = self.output_layer_name

                layer.inner_product_param.num_output = num_output_category

            if 'prob' == layer.name:
                for idx, bottom in enumerate(layer.bottom):
                    if self.original_output_layer_name == bottom:
                        layer.bottom[idx] = self.output_layer_name

        net_string = MessageToString(net_param)
        with open(output_file_path, 'w') as output:
            output.write(net_string)

    # fine-tune the reference CaffeNet on images cropped to the labeled objects
    def train(self, task_id, training_set, testing_set, **kwargs):
        print 'caffe train called'

        self.output_dir_prefix = os.path.abspath('./classifier/caffenet_')
        # create an output folder for each task
        if task_id is not None:
            self.output_dir = self.output_dir_prefix + str(task_id)
        else:
            timestamp = time.strftime("%m_%d_%y_%H_%M_%S")
            self.output_dir = self.output_dir_prefix + timestamp

        # TODO: change to a user-supplied parameter?
        max_iter = 500
        if 'iteration' in kwargs:
            max_iter = kwargs['iteration']

        MyUtils.create_dir(self.output_dir)


        # connect to database manager
        self.db_manager = MongoDBManager(CaffeNetTrainer.dbname, ip_addr='128.2.213.107')
        self.storage_manager = StorageManager('test1')

        # make intermediate folder
        self.ori_image_dir = CaffeNetTrainer.local_tmp_dir + '/' +'original'
        self.crop_image_dir = CaffeNetTrainer.local_tmp_dir + '/' +'crop'
        self.cache_manager = cacheManager(self.ori_image_dir)
        MyUtils.create_dir(self.ori_image_dir)
        MyUtils.create_dir(self.crop_image_dir)
        # get image file list and bounding boxes

        # gather objects across videos: the same object in different videos
        # shares a single identity
        obj_set = {}
        for obj, vids in training_set.iteritems():
            if obj not in obj_set:
                obj_set[obj] = []
            obj_set[obj].extend(vids)
        print obj_set

        # for each object, get images
        training_set_obj_file_path ={}
        for obj, vids in obj_set.iteritems():
            # 1. download image
            # 2. crop image based on bounding boxes
            obj_image_path = []
            obj_bounding_boxes_with_image = {}
            for vid in vids:
                obj_bounding_boxes_each_vid = self.db_manager.getBoundingBoxWithImageByVidAndObj(vid, obj)
                # TODO: could be more efficient; currently a video is
                # re-downloaded each time it appears in the training YAML
                self.cache_manager.cacheImageFiles(vid, dir=False)
                obj_bounding_boxes_with_image.update(obj_bounding_boxes_each_vid)

            for image, bx_list in obj_bounding_boxes_with_image.iteritems():
                im = Image.open(self.ori_image_dir + '/' + image)
                idx = 0
                for bx in bx_list:
                    # bx is [left, top, width, height]; convert to the
                    # (left, upper, right, lower) tuple PIL's crop expects
                    left_x = bx[0]
                    left_y = bx[1]
                    right_x = left_x + bx[2]
                    right_y = left_y + bx[3]
                    output_file_path = self.crop_image_dir + '/' + os.path.splitext(image)[0] + '_' + str(obj) + str(idx) \
                                       + os.path.splitext(image)[1]
                    im.crop((left_x, left_y, right_x, right_y)).save(output_file_path)
                    obj_image_path.append(output_file_path)
                    idx += 1
            training_set_obj_file_path[obj] = obj_image_path

        # generate training/testing file lists
        training_set_obj_file_path, testing_set_obj_file_path = self.split_train_file(training_set_obj_file_path, 10)
        self.generate_caffe_train_file(training_set_obj_file_path, self.output_dir + '/train.txt')
        # NOTE: label indices are assigned by dict iteration order, so train
        # and test labels only line up when both splits contain the same
        # objects in the same order
        self.generate_caffe_train_file(testing_set_obj_file_path, self.output_dir + '/test.txt')
        # generate a label file mapping label indices to object names
        self.generate_caffe_label_file(training_set_obj_file_path, self.output_dir + '/label.txt')

        # modify network prototxt
        num_output_category = len(obj_set)

        train_file_path = os.path.abspath(self.output_dir + '/train.txt')
        test_file_path = os.path.abspath(self.output_dir + '/test.txt')

        output_train_val_path = self.output_dir + '/' + 'train_val_custom.prototxt'
        output_solver_path = self.output_dir + '/' + 'solver_custom.prototxt'
        output_deploy_path = self.output_dir + '/' + 'deploy_custom.prototxt'
        # fine-tuning output
        self.custom_net_path = self.output_dir + '/train_val_custom.prototxt'
        self.custom_snapshot_prefix= self.output_dir + '/caffenet_custom'
        self.output_layer_name ="fc8_custom"
        self.output_net_name="CaffeNetCustom"
        # original model
        self.original_output_layer_name ='fc8'
        # reference design file locations
        input_file_prefix = os.path.abspath('./bvlc_reference_caffenet')
        train_file ='train_val.prototxt'
        solver_file ='solver.prototxt'
        deploy_file ='deploy.prototxt'

        self.mod_caffe_net(input_file_prefix+'/' + train_file, num_output_category, train_file_path,
                           test_file_path, output_train_val_path)
        self.mod_caffe_solver(input_file_prefix+ '/' +solver_file,
                              max_iter, output_solver_path)
        self.mod_caffe_deploy(input_file_prefix + '/' + deploy_file,
                              num_output_category, output_deploy_path)

        cmd ="/opt/caffe/build/tools/caffe train"
        cmd += " -solver " + output_solver_path
        cmd += " -weights ./bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel"
        cmd += " | tee " + self.output_dir + '/' + 'log.txt'

        print cmd
        # issue train
        os.system(cmd)
        MyUtils.zipdir(self.output_dir)
        zip_file = self.output_dir + '.zip'
        print 'uploading file to s3...'
        key = self.storage_manager.uploadFile(zip_file)
        print 'updating database...'

        # TODO: maybe send this information to controller?
        # add meta data info to database
        classifier_metadata={'name': key,
                             'training_set':training_set,
                             'testing_set':testing_set,
                             'accuracy': 'see log',
                             'filepath': key}
        self.db_manager.addClassifier(classifier_metadata)
        return
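
    # Illustrative call (hypothetical names): training_set maps an object
    # label to the videos it was annotated in, e.g.
    #
    #   trainer = CaffeNetTrainer()
    #   trainer.train('task42', {'mug': ['video1', 'video2'],
    #                            'phone': ['video1']}, None, iteration=500)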


    # hold out 1/ratio of each object's files for testing; objects with fewer
    # than two files keep everything in the training split
    def split_train_file(self, objs_file_path, ratio):
        train_files = {}
        test_files = {}
        for obj, obj_files in objs_file_path.iteritems():
            if len(obj_files) < 2:
                train_files[obj] = obj_files
            else:
                test_file_num = int(len(obj_files) / ratio)
                train_files[obj] = obj_files[test_file_num:]
                test_files[obj] = obj_files[:test_file_num]
        return train_files, test_files


    # input should be of format { obj1: [obj1_f1, obj1_f2] , obj2: [obj2_f1, obj2_f2] }
    def generate_caffe_train_file(self, objs_file_path, filename):
        output_list = []
        idx = 0
        for obj, obj_files in objs_file_path.iteritems():
            for each_file in obj_files:
                output_list.append(each_file+" " + str(idx))
            idx += 1
        random.shuffle(output_list)

        with open(filename, 'w') as train_output:
            for line in output_list:
                train_output.write(line+'\n')
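
    # Example train.txt output (hypothetical paths): one "<image path> <label>"
    # per line, shuffled, with the label index assigned per object:
    #
    #   /abs/tmp/s3/crop/frame_07_mug0.jpg 0
    #   /abs/tmp/s3/crop/frame_02_phone0.jpg 1
    #   /abs/tmp/s3/crop/frame_11_mug1.jpg 0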

    # like generate_caffe_train_file but emits bare file paths with no labels
    def generate_caffe_test_file(self, objs_file_path, filename):
        with open(filename, 'w') as test_output:
            for obj, obj_files in objs_file_path.iteritems():
                for each_file in obj_files:
                    test_output.write(each_file + '\n')

    def generate_caffe_label_file(self, objs_file_path, filename):
        with open(filename, 'w') as label_output:
            idx = 0
            for obj, obj_files in objs_file_path.iteritems():
                label_output.write(obj + " " + str(idx)+'\n')
                idx += 1