Ejemplo n.º 1
0
 def __init__(self, root_folder, extensions, prefetch=False,
              target_size=None, max_size=None, min_size=None,
              center_crop=None):
     """Initialize from a two-layer storage.

     Only the root node (mpi.is_root()) lists the files. The file list and
     the labels are then passed through mpi.distribute_list() and the class
     names through mpi.COMM.bcast().

     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files. The name of the folder that contains a file
             is its class; labels are indices into the sorted class names.
         extensions: an extension or a list of extensions that should be
             used to filter the files. Should be like ['png', 'jpg'].
             It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     Raises:
         OSError: if mpi.agree() reports that root_folder does not exist.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError("The specified folder does not exist.")
     logging.debug('Loading from %s', root_folder)
     if isinstance(extensions, str):
         extensions = [extensions]
     # The file names are lowercased before matching, so the extensions have
     # to be lowercased too; otherwise 'JPG' could never match anything.
     extensions = tuple(set(ext.lower() for ext in extensions))
     if mpi.is_root():
         # get files first, then select those that fit the extensions
         candidates = glob.glob(os.path.join(root_folder, '*', '*'))
         files = [f for f in candidates if f.lower().endswith(extensions)]
         logging.debug("A total of %d images.", len(files))
         # the raw label of a file is the name of its parent folder
         folders = [os.path.basename(os.path.dirname(f)) for f in files]
         # sort so we get a reasonable class order
         classnames = sorted(set(folders))
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[folder] for folder in folders]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     # 'is not None' also stays a plain boolean when target_size is an array
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         # replace the file names by whatever self._read() returns for them
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Ejemplo n.º 2
0
 def testDistributeList(self):
     """Check the chunk that mpi.distribute_list() returns on this node.

     The source holds mpi.SIZE repetitions of 0 .. length-1; the local
     result is expected to be exactly 0 .. length-1.
     """
     for length in range(1, 5):
         # list() is required: a Python 3 range object cannot be repeated
         # with '*', while on Python 2 this is a cheap no-op copy.
         source = list(range(length)) * mpi.SIZE
         result = mpi.distribute_list(source)
         self.assertEqual(len(result), length)
         for i in range(length):
             self.assertEqual(result[i], i)
Ejemplo n.º 3
0
 def testDistributeList(self):
     """Check the chunk that mpi.distribute_list() returns on this node.

     The source holds mpi.SIZE repetitions of 0 .. length-1; the local
     result is expected to be exactly 0 .. length-1.
     """
     for length in range(1, 5):
         # list() is required: a Python 3 range object cannot be repeated
         # with '*', while on Python 2 this is a cheap no-op copy.
         source = list(range(length)) * mpi.SIZE
         result = mpi.distribute_list(source)
         self.assertEqual(len(result), length)
         for i in range(length):
             self.assertEqual(result[i], i)
Ejemplo n.º 4
0
 def __init__(self,
              root,
              is_training,
              crop=False,
              prefetch=False,
              target_size=None):
     """Load the dataset.

     Only the root node (mpi.is_root()) reads the .mat list; the file names
     go through mpi.distribute_list() and the labels through
     mpi.distribute().

     Input:
         root: the root folder of the dataset. It has to contain
             'train_list.mat' and 'test_list.mat'.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size
             specified. Should be a list of two integers, like [640,480].

     Note that we will use the python indexing (labels start from 0).
     """
     mat_filename = 'train_list.mat' if is_training else 'test_list.mat'
     if mpi.is_root():
         matfile = io.loadmat(os.path.join(root, mat_filename))
         # The stored labels are shifted by one to start from 0. np.int was
         # only an alias of the builtin int and no longer exists in NumPy.
         labels = np.array(matfile['labels'].flatten() - 1, dtype=int)
         files = [f[0][0] for f in matfile['file_list']]
     else:
         labels = None
         files = None
     self._data = mpi.distribute_list(files)
     self._label = mpi.distribute(labels)
     self._root = root
     self._prefetch = prefetch
     self._crop = crop
     self._target_size = target_size
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if self._prefetch:
         # replace the file names by whatever self._read() returns for them
         self._data = [self._read(i) for i in range(len(self._data))]
Ejemplo n.º 5
0
 def __init__(self, root, is_training, crop=False,
              prefetch=False, target_size=None):
     """Load the dataset.

     Only the root node (mpi.is_root()) reads the .mat list; the file names
     go through mpi.distribute_list() and the labels through
     mpi.distribute().

     Input:
         root: the root folder of the dataset. It has to contain
             'train_list.mat' and 'test_list.mat'.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size
             specified. Should be a list of two integers, like [640,480].

     Note that we will use the python indexing (labels start from 0).
     """
     mat_filename = 'train_list.mat' if is_training else 'test_list.mat'
     if mpi.is_root():
         matfile = io.loadmat(os.path.join(root, mat_filename))
         # The stored labels are shifted by one to start from 0. np.int was
         # only an alias of the builtin int and no longer exists in NumPy.
         labels = np.array(matfile['labels'].flatten() - 1, dtype=int)
         files = [f[0][0] for f in matfile['file_list']]
     else:
         labels = None
         files = None
     self._data = mpi.distribute_list(files)
     self._label = mpi.distribute(labels)
     self._root = root
     self._prefetch = prefetch
     self._crop = crop
     self._target_size = target_size
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if self._prefetch:
         # replace the file names by whatever self._read() returns for them
         self._data = [self._read(i) for i in range(len(self._data))]
Ejemplo n.º 6
0
 def __init__(self, root, is_training, crop=False, subset=None,
              prefetch=False, target_size=None):
     """Load the dataset.

     The annotation files are read on every node (there is no root-only
     guard); the selected images, boxes, parts and labels are then passed
     through mpi.distribute_list() / mpi.distribute().

     Input:
         root: the root folder of the CUB_200_2011 dataset.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         subset: if not None, we will only use the subset specified in the
             list. The content of the list should be class subfolder names,
             like ['001.Black_footed_Albatross', ...]. The labels are then
             renumbered to the positions in this list.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size
             specified. Should be a list of two integers, like [640,480].

     Note that we will use the python indexing (labels start from 0).
     """
     super(CUBDataset, self).__init__()

     def read_lines(*relpath):
         # Return the lines of a text file below root. The with-block closes
         # the file handle right away instead of leaving it to the garbage
         # collector.
         with open(os.path.join(root, *relpath), 'r') as fid:
             return fid.readlines()

     images = [line.split()[1] for line in read_lines('images.txt')]
     boxes = [line.split()[1:] for line in read_lines('bounding_boxes.txt')]
     labels = [int(line.split()[1]) - 1
               for line in read_lines('image_class_labels.txt')]
     classnames = [line.split()[1] for line in read_lines('classes.txt')]
     class2id = dict(zip(classnames, range(len(classnames))))
     split = [int(line.split()[1])
              for line in read_lines('train_test_split.txt')]
     # load parts: drop the first two and the last column, which leaves
     # 15 pairs of values per image
     rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
     rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
     if subset is not None:
         # map the original class id to its position in subset
         old2new = dict((class2id[name], new_id)
                        for new_id, name in enumerate(subset))
         # select the subset
         is_selected = [(label in old2new) for label in labels]
         images = [image for image, val in zip(images, is_selected) if val]
         boxes = [box for box, val in zip(boxes, is_selected) if val]
         labels = [old2new[label]
                   for label, val in zip(labels, is_selected) if val]
         classnames = subset
         split = [trte for trte, val in zip(split, is_selected) if val]
         rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
     # now, do training testing split: 1 marks training, 0 marks testing
     target = 1 if is_training else 0
     images = [os.path.join(root, 'images', image)
               for image, val in zip(images, split) if val == target]
     boxes = [box for box, val in zip(boxes, split) if val == target]
     labels = [label for label, val in zip(labels, split) if val == target]
     rawparts = rawparts[np.asarray(split) == target] - 1
     # store the necessary values
     self._data = mpi.distribute_list(images)
     # for the boxes, we store them as a numpy array
     self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
     # NOTE(review): this subtracts 1 from every box column, not only from
     # the corner coordinates -- confirm that this is intended.
     self._boxes -= 1
     self._parts = mpi.distribute(rawparts)
     self._prefetch = prefetch
     self._target_size = target_size
     self._crop = crop
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     # we store the raw dimensions for part location computation
     self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
     if prefetch:
         self._data = [self._read(i) for i in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Ejemplo n.º 7
0
 def __init__(self,
              root_folder,
              extensions,
              prefetch=False,
              target_size=None,
              max_size=None,
              min_size=None,
              center_crop=None):
     """Initialize from a two-layer storage.

     Only the root node (mpi.is_root()) lists the files. The file list and
     the labels are then passed through mpi.distribute_list() and the class
     names through mpi.COMM.bcast().

     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files. The name of the folder that contains a file
             is its class; labels are indices into the sorted class names.
         extensions: an extension or a list of extensions that should be
             used to filter the files. Should be like ['png', 'jpg'].
             It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     Raises:
         OSError: if mpi.agree() reports that root_folder does not exist.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError("The specified folder does not exist.")
     logging.debug('Loading from %s', root_folder)
     if isinstance(extensions, str):
         extensions = [extensions]
     # The file names are lowercased before matching, so the extensions have
     # to be lowercased too; otherwise 'JPG' could never match anything.
     extensions = tuple(set(ext.lower() for ext in extensions))
     if mpi.is_root():
         # get files first, then select those that fit the extensions
         candidates = glob.glob(os.path.join(root_folder, '*', '*'))
         files = [f for f in candidates if f.lower().endswith(extensions)]
         logging.debug("A total of %d images.", len(files))
         # the raw label of a file is the name of its parent folder
         folders = [os.path.basename(os.path.dirname(f)) for f in files]
         # sort so we get a reasonable class order
         classnames = sorted(set(folders))
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[folder] for folder in folders]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     # 'is not None' also stays a plain boolean when target_size is an array
     if target_size is not None:
         self._dim = tuple(target_size) + (3, )
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         # replace the file names by whatever self._read() returns for them
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Ejemplo n.º 8
0
    def __init__(self, list_file, feat_range, posting_file, perc_pos,
                 keep_full_utt=False, posting_sampler=None, min_dur=0.2,
                 min_count=0.0, max_count=10000000.0,
                 reader_type='utterance', pickle_fname=None,
                 list_file_sph=None, kw_feat=None, merge_score_files=None):
        '''Build the keyword examples from a posting list and a reader.

        The reader and the posting sampler are created on every node. The
        examples themselves are collected on the root node only
        (mpi.is_root()) and then passed through mpi.distribute_list() /
        mpi.distribute().

        Input:
            list_file: passed to the reader. If the name contains 'eval',
                the dataset is flagged as evaluation data and uses
                FLAGS.T_eval instead of FLAGS.T_train.
            feat_range: passed to ReadAllUtterances(); only used by the
                'utterance' reader.
            posting_file: parsed with PostingParser when no posting_sampler
                is given.
            perc_pos: passed to Sampler.SampleData() when no posting_sampler
                is given.
            keep_full_utt: if true, the reader is kept as self.utt_reader.
            posting_sampler: an already prepared sampler; if None a new one
                is built from posting_file.
            min_dur: entries with sys_et - sys_bt below this are skipped.
            min_count, max_count: only entries whose keyword occurs between
                min_count and max_count times (inclusive) among the
                positive entries are kept.
            reader_type: one of 'lattice', 'utterance', 'snr', 'srate' or
                'score'.
            pickle_fname, list_file_sph, merge_score_files: forwarded to the
                reader constructors that accept them.
            kw_feat: if not None, keyword maps are copied from it when it
                looks like a mapping and CopyKeywordMaps() succeeds;
                otherwise they are computed from FLAGS.hescii_file.
        Raises:
            SystemExit: for an unknown reader_type, or (root node only) for
                a negative entry that has no system times.

        TODO: Read pieces of utterance from the CSV file instead to save
        memory. It would be nice to index these by utt_id (by now I do a
        map).
        '''
        super(BabelDataset, self).__init__()
        self.is_eval = 'eval' in list_file
        self.T = FLAGS.T_eval if self.is_eval else FLAGS.T_train
        self.beta = FLAGS.beta
        self.reader_type = reader_type
        if reader_type == 'lattice':
            utt_reader = LatticeReader.LatticeReader(list_file)
            utt_reader.ReadAllLatices()
        elif reader_type == 'utterance':
            utt_reader = UtteranceReader.UtteranceReader(
                list_file, pickle_fname=pickle_fname)
            utt_reader.ReadAllUtterances(feat_range)
        elif reader_type == 'snr':
            utt_reader = SNRReader.SNRReader(
                list_file, pickle_fname=pickle_fname)
            utt_reader.ReadAllSNR()
        elif reader_type == 'srate':
            utt_reader = SrateReader.SrateReader(
                list_file, pickle_fname=pickle_fname)
            utt_reader.ReadAllSrate()
        elif reader_type == 'score':
            utt_reader = ScoreReader.ScoreReader(
                list_file, list_file_sph=list_file_sph,
                pickle_fname=pickle_fname,
                merge_score_files=merge_score_files)
        else:
            # A message argument makes the process exit with a non-zero
            # status; the old exit(0) reported success on this failure.
            raise SystemExit('Reader not implemented!')
        self.is_lattice = (reader_type == 'lattice')
        if posting_sampler is None:
            posting_sampler = Sampler.Sampler(
                PostingParser.PostingParser(posting_file))
            posting_sampler.GetPositive()
            posting_sampler.GetNegative()
            posting_sampler.SampleData(perc_pos)
        self.posting_sampler = posting_sampler
        self.min_dur = min_dur
        self._data_all = None
        self._dim = False
        self._channels = 1
        self.keep_full_utt = keep_full_utt
        negatives = self.posting_sampler.negative_data
        positives = self.posting_sampler.positive_data
        known_utts = utt_reader.map_utt_idx
        if mpi.is_root():
            self._data = []
            self._label = []
            self._features = []
            self._utt_id = []
            self._times = []
            self._keyword = []
            skipped = 0
            # negatives (label 0) first, then positives (label 1)
            for label, entries in ((0, negatives), (1, positives)):
                for entry in entries:
                    if entry['file'] not in known_utts:
                        continue
                    if entry['sys_bt'] == '':
                        if label == 0:
                            raise SystemExit(
                                'We found a negative example that was not '
                                'produced by the system!')
                        # positives without system times are ignored
                        continue
                    sys_bt = float(entry['sys_bt'])
                    sys_et = float(entry['sys_et'])
                    sys_sc = float(entry['sys_score'])
                    if sys_et - sys_bt < self.min_dur:
                        skipped += 1
                        continue
                    self._data.append(utt_reader.GetKeywordData(
                        entry['file'], sys_bt, sys_et, kw=entry['termid']))
                    self._label.append(label)
                    self._features.append(sys_sc)
                    self._utt_id.append(entry['file'])
                    self._times.append((sys_bt, sys_et))
                    self._keyword.append(entry['termid'])
            # the double spaces reproduce the output of the former
            # comma-separated print statement
            print('I skipped  %s  entries out of  %s'
                  % (skipped, len(negatives) + len(positives)))
            self._label = np.array(self._label)
        else:
            self._data = None
            self._label = None
            self._features = None
            self._utt_id = None
            self._times = None
            self._keyword = None
        # populate true kw freq (on every node)
        self._map_kw_counts = {}
        for entry in positives:
            if entry['file'] in known_utts:
                kw = entry['termid']
                self._map_kw_counts[kw] = self._map_kw_counts.get(kw, 0) + 1
        # filter dataset depending on count
        if mpi.is_root():
            ind_keep = [
                i for i, kw in enumerate(self._keyword)
                if min_count <= self._map_kw_counts.get(kw, 0) <= max_count
            ]
            self._data = [self._data[i] for i in ind_keep]
            # NOTE(review): this turns the label array back into a plain
            # list before mpi.distribute() -- confirm that is accepted.
            self._label = [self._label[i] for i in ind_keep]
            self._features = [self._features[i] for i in ind_keep]
            self._utt_id = [self._utt_id[i] for i in ind_keep]
            self._times = [self._times[i] for i in ind_keep]
            self._keyword = [self._keyword[i] for i in ind_keep]
        self._data = mpi.distribute_list(self._data)
        self._label = mpi.distribute(self._label)
        self._features = mpi.distribute_list(self._features)
        self._utt_id = mpi.distribute_list(self._utt_id)
        self._times = mpi.distribute_list(self._times)
        self._keyword = mpi.distribute_list(self._keyword)
        if keep_full_utt:
            self.utt_reader = utt_reader
        if kw_feat is not None:
            try:
                # Mapping probe: raises AttributeError for non-mappings on
                # Python 2 and 3 alike (dict.has_key only exists on 2).
                kw_feat.keys()
                self.CopyKeywordMaps(kw_feat)
            except Exception:
                # as before, any failure falls back to computing the maps
                self.LoadMappingHescii(FLAGS.hescii_file)
                self.ComputeKeywordMaps()
Ejemplo n.º 9
0
 def __init__(self, root, is_training, crop=False, subset=None,
              prefetch=False, target_size=None, version='2011'):
     """Load the dataset.

     The annotation files are read on every node (there is no root-only
     guard); the selected images, boxes, parts and labels are then passed
     through mpi.distribute_list() / mpi.distribute().

     Input:
         root: the root folder of the CUB dataset in the given version.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         subset: if not None, we will only use the subset specified in the
             list. The content of the list should be class subfolder names,
             like ['001.Black_footed_Albatross', ...]. The labels are then
             renumbered to the positions in this list.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size
             specified. Should be a list of two integers, like [640,480].
         version: either '2011' or '2010'. Note that the 2011 version
             contains the parts, while the 2010 version does not.
     Raises:
         ValueError: if version is neither '2011' nor '2010'.

     Note that we will use the python indexing (labels start from 0).
     """
     super(CUBDataset, self).__init__()

     def read_lines(*relpath):
         # Return the lines of a text file below root. The with-block closes
         # the file handle right away instead of leaving it to the garbage
         # collector.
         with open(os.path.join(root, *relpath), 'r') as fid:
             return fid.readlines()

     if version == '2011':
         images = [line.split()[1] for line in read_lines('images.txt')]
         boxes = [line.split()[1:]
                  for line in read_lines('bounding_boxes.txt')]
         labels = [int(line.split()[1]) - 1
                   for line in read_lines('image_class_labels.txt')]
         classnames = [line.split()[1]
                       for line in read_lines('classes.txt')]
         split = [int(line.split()[1])
                  for line in read_lines('train_test_split.txt')]
         # load parts: drop the first two and the last column, which leaves
         # 15 pairs of values per image
         rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
         rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
     elif version == '2010':
         # we are using version 2010. We load the data to mimic the 2011
         # version data format
         images = [line.strip()
                   for line in read_lines('lists', 'files.txt')]
         # unfortunately, we need to load the boxes from matlab annotations
         boxes = []
         for filename in images:
             matfile = io.loadmat(os.path.join(root, 'annotations-mat',
                                               filename[:-3] + 'mat'))
             left, top, right, bottom = \
                 [matfile['bbox'][0][0][i][0][0] for i in range(4)]
             # same layout as the 2011 boxes: corner, then extent
             boxes.append([left, top, right - left, bottom - top])
         # a set keeps the membership test below constant-time per image
         train_images = set(line.strip()
                            for line in read_lines('lists', 'train.txt'))
         # the class number is the part of the name before the first '.'
         labels = [int(line[:line.find('.')]) - 1 for line in images]
         classnames = [line.strip()
                       for line in read_lines('lists', 'classes.txt')]
         split = [int(line in train_images) for line in images]
         # we do not have rawparts.
         rawparts = None
     else:
         raise ValueError("Unrecognized version: %s" % version)
     if subset is not None:
         class2id = dict(zip(classnames, range(len(classnames))))
         # map the original class id to its position in subset
         old2new = dict((class2id[name], new_id)
                        for new_id, name in enumerate(subset))
         # select the subset
         is_selected = [(label in old2new) for label in labels]
         images = [image for image, val in zip(images, is_selected) if val]
         boxes = [box for box, val in zip(boxes, is_selected) if val]
         labels = [old2new[label]
                   for label, val in zip(labels, is_selected) if val]
         classnames = subset
         split = [trte for trte, val in zip(split, is_selected) if val]
         if rawparts is not None:
             rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
     # now, do training testing split: 1 marks training, 0 marks testing
     target = 1 if is_training else 0
     images = [image for image, val in zip(images, split) if val == target]
     boxes = [box for box, val in zip(boxes, split) if val == target]
     labels = [label for label, val in zip(labels, split) if val == target]
     if rawparts is not None:
         rawparts = rawparts[np.asarray(split) == target] - 1
     # store the necessary values
     self._version = version
     self._root = root
     self._data = mpi.distribute_list(images)
     # keeps the file names even when prefetch replaces self._data below
     self._raw_name = self._data
     # for the boxes, we store them as a numpy array
     self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
     # NOTE(review): this subtracts 1 from every box column, not only from
     # the corner coordinates -- confirm that this is intended.
     self._boxes -= 1
     if rawparts is not None:
         self._parts = mpi.distribute(rawparts)
     else:
         self._parts = None
     self._prefetch = prefetch
     self._target_size = target_size
     self._crop = crop
     if target_size is not None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     # we store the raw dimensions for part location computation
     self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
     if prefetch:
         self._data = [self._read(i) for i in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Ejemplo n.º 10
0
# Report errors from every node, but informational messages only from root.
mpi.log_level(logging.ERROR)
mpi.root_log_level(logging.INFO)

# Only the root node scans the folders given by the flags; the resulting list
# is then passed through mpi.distribute_list().
files = []
if mpi.is_root():
    if FLAGS.train != "":
        logging.info("Adding training images..")
        # training images sit one folder level below FLAGS.train
        files += glob.glob(os.path.join(FLAGS.train, '*', '*.JPEG'))
    if FLAGS.val != "":
        logging.info("Adding validation images..")
        files += glob.glob(os.path.join(FLAGS.val, '*.JPEG'))
    if FLAGS.test != "":
        logging.info("Adding testing images..")
        files += glob.glob(os.path.join(FLAGS.test, '*.JPEG'))
    logging.info("A total of %d images to check", len(files))
files = mpi.distribute_list(files)

logging.info('Validating...')
errornum = 0
for filename in files:
    try:
        # NOTE(review): presumably PIL's Image; open() alone may not decode
        # the pixel data, so a stricter check (e.g. verify()) could be
        # considered -- confirm before changing what counts as corrupted.
        Image.open(filename)
    except Exception:
        # log the offending file name and keep going
        logging.error(filename)
        errornum += 1
# combine the per-node error counts
errornum = mpi.COMM.allreduce(errornum)
if errornum == 0:
    logging.info("Done. No corrupted images found.")
else:
    logging.info("Done. %d corrupted images found.", errornum)
Ejemplo n.º 11
0
 def __init__(self,
              root,
              is_training,
              crop=False,
              subset=None,
              prefetch=False,
              target_size=None,
              version='2011'):
     """Load the dataset.

     The annotation files are read on every node (there is no root-only
     guard); the selected images, boxes, parts and labels are then passed
     through mpi.distribute_list() / mpi.distribute().

     Input:
         root: the root folder of the CUB dataset in the given version.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         subset: if not None, we will only use the subset specified in the
             list. The content of the list should be class subfolder names,
             like ['001.Black_footed_Albatross', ...]. The labels are then
             renumbered to the positions in this list.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size
             specified. Should be a list of two integers, like [640,480].
         version: either '2011' or '2010'. Note that the 2011 version
             contains the parts, while the 2010 version does not.
     Raises:
         ValueError: if version is neither '2011' nor '2010'.

     Note that we will use the python indexing (labels start from 0).
     """
     super(CUBDataset, self).__init__()

     def read_lines(*relpath):
         # Return the lines of a text file below root. The with-block closes
         # the file handle right away instead of leaving it to the garbage
         # collector.
         with open(os.path.join(root, *relpath), 'r') as fid:
             return fid.readlines()

     if version == '2011':
         images = [line.split()[1] for line in read_lines('images.txt')]
         boxes = [
             line.split()[1:] for line in read_lines('bounding_boxes.txt')
         ]
         labels = [
             int(line.split()[1]) - 1
             for line in read_lines('image_class_labels.txt')
         ]
         classnames = [
             line.split()[1] for line in read_lines('classes.txt')
         ]
         split = [
             int(line.split()[1])
             for line in read_lines('train_test_split.txt')
         ]
         # load parts: drop the first two and the last column, which leaves
         # 15 pairs of values per image
         rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
         rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
     elif version == '2010':
         # we are using version 2010. We load the data to mimic the 2011
         # version data format
         images = [
             line.strip() for line in read_lines('lists', 'files.txt')
         ]
         # unfortunately, we need to load the boxes from matlab annotations
         boxes = []
         for filename in images:
             matfile = io.loadmat(
                 os.path.join(root, 'annotations-mat',
                              filename[:-3] + 'mat'))
             left, top, right, bottom = \
                 [matfile['bbox'][0][0][i][0][0] for i in range(4)]
             # same layout as the 2011 boxes: corner, then extent
             boxes.append([left, top, right - left, bottom - top])
         # a set keeps the membership test below constant-time per image
         train_images = set(
             line.strip() for line in read_lines('lists', 'train.txt'))
         # the class number is the part of the name before the first '.'
         labels = [int(line[:line.find('.')]) - 1 for line in images]
         classnames = [
             line.strip() for line in read_lines('lists', 'classes.txt')
         ]
         split = [int(line in train_images) for line in images]
         # we do not have rawparts.
         rawparts = None
     else:
         raise ValueError("Unrecognized version: %s" % version)
     if subset is not None:
         class2id = dict(zip(classnames, range(len(classnames))))
         # map the original class id to its position in subset
         old2new = dict((class2id[name], new_id)
                        for new_id, name in enumerate(subset))
         # select the subset
         is_selected = [(label in old2new) for label in labels]
         images = [image for image, val in zip(images, is_selected) if val]
         boxes = [box for box, val in zip(boxes, is_selected) if val]
         labels = [
             old2new[label] for label, val in zip(labels, is_selected)
             if val
         ]
         classnames = subset
         split = [trte for trte, val in zip(split, is_selected) if val]
         if rawparts is not None:
             rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
     # now, do training testing split: 1 marks training, 0 marks testing
     target = 1 if is_training else 0
     images = [image for image, val in zip(images, split) if val == target]
     boxes = [box for box, val in zip(boxes, split) if val == target]
     labels = [label for label, val in zip(labels, split) if val == target]
     if rawparts is not None:
         rawparts = rawparts[np.asarray(split) == target] - 1
     # store the necessary values
     self._version = version
     self._root = root
     self._data = mpi.distribute_list(images)
     # keeps the file names even when prefetch replaces self._data below
     self._raw_name = self._data
     # for the boxes, we store them as a numpy array
     self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
     # NOTE(review): this subtracts 1 from every box column, not only from
     # the corner coordinates -- confirm that this is intended.
     self._boxes -= 1
     if rawparts is not None:
         self._parts = mpi.distribute(rawparts)
     else:
         self._parts = None
     self._prefetch = prefetch
     self._target_size = target_size
     self._crop = crop
     if target_size is not None:
         self._dim = tuple(target_size) + (3, )
     else:
         self._dim = False
     self._channels = 3
     # we store the raw dimensions for part location computation
     self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
     if prefetch:
         self._data = [self._read(i) for i in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
# Report errors from every node, but informational messages only from root.
mpi.log_level(logging.ERROR)
mpi.root_log_level(logging.INFO)

# Only the root node scans the folders given by the flags; the resulting list
# is then passed through mpi.distribute_list().
files = []
if mpi.is_root():
    if FLAGS.train != "":
        logging.info("Adding training images..")
        # training images sit one folder level below FLAGS.train
        files += glob.glob(os.path.join(FLAGS.train, '*', '*.JPEG'))
    if FLAGS.val != "":
        logging.info("Adding validation images..")
        files += glob.glob(os.path.join(FLAGS.val, '*.JPEG'))
    if FLAGS.test != "":
        logging.info("Adding testing images..")
        files += glob.glob(os.path.join(FLAGS.test, '*.JPEG'))
    logging.info("A total of %d images to check", len(files))
files = mpi.distribute_list(files)

logging.info('Validating...')
errornum = 0
for filename in files:
    try:
        # NOTE(review): presumably PIL's Image; open() alone may not decode
        # the pixel data, so a stricter check (e.g. verify()) could be
        # considered -- confirm before changing what counts as corrupted.
        Image.open(filename)
    except Exception:
        # log the offending file name and keep going
        logging.error(filename)
        errornum += 1
# combine the per-node error counts
errornum = mpi.COMM.allreduce(errornum)
if errornum == 0:
    logging.info("Done. No corrupted images found.")
else:
    logging.info("Done. %d corrupted images found.", errornum)