def load_data(self):
    data_file = os.path.join(
        self.path, 'findata-' + str(self.nlags) + '-' + str(self.quick) + '.pkl')
    if os.path.exists(data_file):
        print("Loading cached data from %s" % data_file)
        with open(data_file, 'rb') as f:
            (self.nfeats, self.train_x, self.train_y,
             self.valid_x, self.valid_y) = pickle.load(f)
        return

    print("Processing data...")
    full = pd.read_hdf(os.path.join(self.path, self.filename), 'train')
    meds = full.median(axis=0)
    full.fillna(meds, inplace=True)
    cols = [col for col in full.columns if col not in ['id', 'timestamp', 'y']]
    self.nfeats = len(cols)
    uniq_ts = full['timestamp'].unique()
    mid = uniq_ts[len(uniq_ts) // 2]  # integer division to get a valid index
    train = full[full.timestamp < mid].reset_index()
    valid = full[full.timestamp >= mid].reset_index()
    if self.quick:
        train = train[train.id < 200].reset_index()
        valid = valid[valid.id < 200].reset_index()
    train_x, train_y = self.process(train, cols, self.nlags)
    valid_x, valid_y = self.process(valid, cols, self.nlags)
    self.train_x, self.train_y = self.shuffle(train_x, train_y)
    self.valid_x, self.valid_y = valid_x, valid_y
    with open(data_file, 'wb') as f:
        pickle.dump((self.nfeats, self.train_x, self.train_y,
                     self.valid_x, self.valid_y), f)
    print("Saved data to %s" % data_file)
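The method above follows a compute-or-load-from-cache pattern around pickle. A minimal standalone sketch of the same idea, with a hypothetical cached() helper and a commented-out example call:

import os
import pickle

def cached(path, compute):
    # return the object pickled at `path`, computing and caching it when absent
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(path, 'wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return result

# hypothetical usage mirroring load_data's cache-file naming
# data = cached('findata-5-False.pkl', expensive_preprocessing)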
def run_voc_eval(annopath, imagesetfile, year, image_set, classes, output_dir):
    aps = []
    # The PASCAL VOC metric changed in 2010
    use_07_metric = True if int(year) < 2010 else False
    neon_logger.display('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
    for i, cls in enumerate(classes):
        if cls == '__background__':
            continue
        filename = 'voc_{}_{}_{}.txt'.format(year, image_set, cls)
        filepath = os.path.join(output_dir, filename)
        rec, prec, ap = voc_eval(filepath, annopath, imagesetfile, cls, output_dir,
                                 ovthresh=0.5, use_07_metric=use_07_metric)
        aps += [ap]
        neon_logger.display('AP for {} = {:.4f}'.format(cls, ap))
        # binary mode so the pickle can be written under Python 3
        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
            pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f, 2)
    neon_logger.display('Mean AP = {:.4f}'.format(np.mean(aps)))
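A hedged usage sketch for run_voc_eval, assuming the standard 20-class PASCAL VOC label set plus '__background__'. The paths are hypothetical, and the per-class detection files voc_2007_test_<cls>.txt are assumed to already exist in output_dir:

voc_classes = ('__background__', 'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
               'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep',
               'sofa', 'train', 'tvmonitor')

# hypothetical paths; adjust to the local VOCdevkit layout and output directory
run_voc_eval(annopath='VOCdevkit/VOC2007/Annotations/{}.xml',
             imagesetfile='VOCdevkit/VOC2007/ImageSets/Main/test.txt',
             year='2007', image_set='test',
             classes=voc_classes, output_dir='frcn_output')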
def zca_whiten(train, test, cache=None):
    """
    Use train set statistics to apply the ZCA whitening transform to both
    train and test sets.
    """
    if cache and os.path.isfile(cache):
        with open(cache, 'rb') as f:
            (meanX, W) = pickle_load(f)
    else:
        meanX, W = CIFAR10._compute_zca_transform(train)
        if cache:
            logger.info("Caching ZCA transform matrix")
            with open(cache, 'wb') as f:
                pickle.dump((meanX, W), f, 2)

    logger.info("Applying ZCA whitening transform")
    train_w = np.dot(train - meanX, W)
    test_w = np.dot(test - meanX, W)

    return train_w, test_w
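A minimal usage sketch, assuming train and test are 2-D float arrays with one flattened image per row and that CIFAR10._compute_zca_transform accepts any such array; the cache path is hypothetical:

# random data standing in for flattened 32x32x3 images (rows are examples)
train = np.random.rand(1000, 3072).astype(np.float32)
test = np.random.rand(200, 3072).astype(np.float32)
train_w, test_w = zca_whiten(train, test, cache='cifar10_zca_cache.pkl')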
def save_obj(obj, save_path):
    """
    Dumps a python data structure to a saved on-disk representation.  We
    currently support writing to the following file formats (expected
    filename extension in brackets):

        * python pickle (.pkl)

    Arguments:
        obj (object): the python object to be saved.
        save_path (str): Where to write the serialized object (full path
                         and file name)

    See Also:
        :py:func:`~neon.models.model.Model.serialize`
    """
    if save_path is None or len(save_path) == 0:
        return
    save_path = os.path.expandvars(os.path.expanduser(save_path))
    logger.debug("serializing object to: %s", save_path)
    ensure_dirs_exist(save_path)
    pickle.dump(obj, open(save_path, 'wb'), 2)
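A brief usage sketch; the checkpoint path is hypothetical, and ensure_dirs_exist is expected to create any missing parent directories after ~ and environment variables are expanded:

# hypothetical checkpoint of a small parameter dictionary
params = {'epoch': 10, 'lr': 0.01, 'weights': np.zeros((3, 3))}
save_obj(params, '~/nervana/checkpoints/params.pkl')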
def serialize(self, obj, save_path):
    # open in binary mode so the pickle can be written under Python 3
    with open(save_path, 'wb') as fd:
        pickle.dump(obj, fd, -1)
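Only the save half is shown above; a matching load method is not part of the snippet. A hypothetical counterpart, assuming the same binary-pickle convention:

def deserialize(self, save_path):
    # hypothetical inverse of serialize(): read the pickle back in binary mode
    with open(save_path, 'rb') as fd:
        return pickle.load(fd)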
def build_data_train(path='.', filepath='labeledTrainData.tsv', vocab_file=None,
                     vocab=None, skip_headers=True, train_ratio=0.8):
    """
    Loads the data file and spits out an h5 file with records of
    {y, review_text, review_int}.
    Typically two passes over the data.  The first pass builds the vocab and
    does pre-processing (WARNING: to get phrases, we need to go through
    multiple passes).  The second pass converts text into integers.  We deal
    with integers from then on.

    WARNING: we use h5 just as a proof of concept for handling large datasets.
    Datasets may fit entirely in memory as a numpy array.
    """
    fname_h5 = filepath + '.h5'
    if vocab_file is None:
        fname_vocab = filepath + '.vocab'
    else:
        fname_vocab = vocab_file

    if not os.path.exists(fname_h5) or not os.path.exists(fname_vocab):
        # create the h5 store - NOTE: hdf5 is a row-oriented store and we slice rows
        # reviews_text holds the metadata and processed text file
        # reviews_int holds the ratings, ints
        h5f = h5py.File(fname_h5, 'w')
        shape, maxshape = (2 ** 16,), (None, )
        dt = np.dtype([('y', np.uint8),
                       ('split', bool),
                       ('num_words', np.uint16),
                       # WARNING: vlen=bytes in python 3
                       ('text', h5py.special_dtype(vlen=str))
                       ])
        reviews_text = h5f.create_dataset('reviews', shape=shape, maxshape=maxshape,
                                          dtype=dt, compression='gzip')
        reviews_train = h5f.create_dataset(
            'train', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')
        reviews_valid = h5f.create_dataset(
            'valid', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')

        wdata = np.zeros((1, ), dtype=dt)

        # init vocab only for train data
        build_vocab = False
        if vocab is None:
            vocab = defaultdict(int)
            build_vocab = True
        nsamples = 0

        # open the file, skip the headers if needed
        f = open(filepath, 'r')
        if skip_headers:
            f.readline()

        for i, line in enumerate(f):
            _, rating, review = line.strip().split('\t')

            # clean the review
            review = clean_string(review)
            review_words = review.strip().split()
            num_words = len(review_words)
            split = int(np.random.rand() < train_ratio)

            # create record
            wdata['y'] = int(float(rating))
            wdata['text'] = review
            wdata['num_words'] = num_words
            wdata['split'] = split
            reviews_text[i] = wdata

            # update the vocab if needed
            if build_vocab:
                for word in review_words:
                    vocab[word] += 1

            nsamples += 1

        # histogram of class labels, sentence length
        ratings, counts = np.unique(reviews_text['y'][:nsamples],
                                    return_counts=True)
        sen_len, sen_len_counts = np.unique(reviews_text['num_words'][:nsamples],
                                            return_counts=True)
        vocab_size = len(vocab)
        nclass = len(ratings)
        reviews_text.attrs['vocab_size'] = vocab_size
        reviews_text.attrs['nrows'] = nsamples
        reviews_text.attrs['nclass'] = nclass
        reviews_text.attrs['class_distribution'] = counts
        neon_logger.display("vocabulary size - {}".format(vocab_size))
        neon_logger.display("# of samples - {}".format(nsamples))
        neon_logger.display("# of classes {}".format(nclass))
        neon_logger.display("class distribution - {} {}".format(ratings, counts))
        sen_counts = list(zip(sen_len, sen_len_counts))
        sen_counts = sorted(sen_counts, key=lambda kv: kv[1], reverse=True)
        neon_logger.display("sentence length - {} {} {}".format(
            len(sen_len), sen_len, sen_len_counts))

        # WARNING: assume vocab is of order ~4-5 million words.
        # sort the vocab, re-assign ids by frequency.  Useful for downstream tasks.
        # only done for train data
        if build_vocab:
            vocab_sorted = sorted(list(vocab.items()), key=lambda kv: kv[1],
                                  reverse=True)
            vocab = {}
            for i, t in enumerate(list(zip(*vocab_sorted))[0]):
                vocab[t] = i

        # map text to integers
        ntrain = 0
        nvalid = 0
        for i in range(nsamples):
            text = reviews_text[i]['text']
            y = int(reviews_text[i]['y'])
            split = reviews_text[i]['split']
            text_int = [y] + [vocab[t] for t in text.strip().split()]
            if split:
                reviews_train[ntrain] = text_int
                ntrain += 1
            else:
                reviews_valid[nvalid] = text_int
                nvalid += 1

        reviews_text.attrs['ntrain'] = ntrain
        reviews_text.attrs['nvalid'] = nvalid
        neon_logger.display("# of train - {0}, # of valid - {1}".format(
            reviews_text.attrs['ntrain'], reviews_text.attrs['nvalid']))

        # close open files
        h5f.close()
        f.close()

    if not os.path.exists(fname_vocab):
        rev_vocab = {}
        for wrd, wrd_id in vocab.items():
            rev_vocab[wrd_id] = wrd
        neon_logger.display(
            "vocabulary from IMDB dataset is saved into {}".format(fname_vocab))
        pickle.dump((vocab, rev_vocab), open(fname_vocab, 'wb'), 2)

    return fname_h5, fname_vocab
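A brief usage sketch for the preprocessing above, assuming the Kaggle labeledTrainData.tsv file sits in the working directory; the 0.9 split ratio is an arbitrary example value:

# hypothetical run over the Kaggle IMDB training dump
fname_h5, fname_vocab = build_data_train(filepath='labeledTrainData.tsv',
                                         train_ratio=0.9)

# the vocab pickle holds (vocab, rev_vocab) and can be reloaded for later tokenization
with open(fname_vocab, 'rb') as f:
    vocab, rev_vocab = pickle.load(f)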
def voc_eval(detpath, annopath, imagesetfile, classname, cachedir,
             ovthresh=0.5, use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images (text mode so the names come back as str, not bytes)
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                neon_logger.display('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save
        neon_logger.display('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f, 2)
    else:
        # load
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()

    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    # sorted_scores = np.sort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    R['det'][jmax] = 1
                else:
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    prec = tp / (tp + fp + 1e-10)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
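The overlap test inside the detection loop above is a standard intersection-over-union computation using the VOC "+1 pixel" convention. A small self-contained check of the same arithmetic on made-up boxes:

import numpy as np

def voc_iou(bb, BBGT):
    # IoU between one detection `bb` and ground-truth boxes `BBGT` (VOC +1 convention)
    ixmin = np.maximum(BBGT[:, 0], bb[0])
    iymin = np.maximum(BBGT[:, 1], bb[1])
    ixmax = np.minimum(BBGT[:, 2], bb[2])
    iymax = np.minimum(BBGT[:, 3], bb[3])
    iw = np.maximum(ixmax - ixmin + 1., 0.)
    ih = np.maximum(iymax - iymin + 1., 0.)
    inters = iw * ih
    uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
           (BBGT[:, 2] - BBGT[:, 0] + 1.) *
           (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
    return inters / uni

# a detection that exactly covers the first ground-truth box gives IoU 1.0,
# while a disjoint box gives 0.0
print(voc_iou(np.array([0., 0., 9., 9.]),
              np.array([[0., 0., 9., 9.], [20., 20., 29., 29.]])))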
def my_pickle(filename, data):
    # binary mode: pickle output is bytes, not text
    with open(filename, "wb") as fo:
        pickle.dump(data, fo, protocol=pickle.HIGHEST_PROTOCOL)
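A matching loader is not part of the original snippet; this hypothetical sketch mirrors my_pickle's binary-mode convention:

def my_unpickle(filename):
    # hypothetical counterpart to my_pickle(): read the pickle back in binary mode
    with open(filename, "rb") as fi:
        return pickle.load(fi)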