def load(subset=None, min_occ=1, min_files=1):
    """
    Loads the raw text data that constitutes the Microsoft
    Sentence Completion Challenge (stored in ./data/).
    Processes the data, tokenizes and parses it, and returns
    the results.

    Returned is a tuple (train_sents, question_groups, answers,
    feature_sizes). The 'train_sents' is a numpy array of shape
    (token_count, feature_count). Feature columns are at first
    textual (orth, lemma, lemma_4), then syntactic
    (pos, dependency-type). The [-2] column contains
    syntactic-parent indices, and the [-1] column denotes to which
    sentence the token belongs. The 'question_groups' object is an
    iterable of question groups. Each group consists of 5 sentences
    (one of which is correct). Each sentence is a parsed-text-array
    as described above. The 'answers' object is a numpy array of
    shape (question_group_count, ) that contains the indices of the
    correct sentences in the question groups. The 'feature_sizes'
    object is a numpy array containing the dimensionality of each
    feature.

    :param subset: The number of training files to process.
        If None (default), all of the files are processed.
    :param min_occ: Minimum required number of occurrences of a
        token (word) for it to be included in the vocabulary.
        The default value (1) uses all words that occurred in the
        trainset.
    :param min_files: Minimum required number of files a term has
        to occur in for it to be included in the vocabulary.
    """
    dir = os.path.join("data", "processed")
    if not os.path.exists(dir):
        os.makedirs(dir)
    name_base = "subset_%r-min_occ_%r-min_files_%r" % (
        subset, min_occ, min_files)

    # look for the cached processed data, return if present
    file_name = os.path.join(dir, name_base + ".pkl")
    data = util.try_pickle_load(file_name)
    if data is not None:
        return data

    # did not find cached data, will have to process it
    # log the loading process also to a file
    log_name = os.path.join(dir, name_base + ".log")
    log.addHandler(logging.FileHandler(log_name))

    # process the data, cache it and return
    data = _load(subset, min_occ, min_files)
    util.try_pickle_dump(data, file_name)
    return data
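

# Usage sketch (illustrative only): how 'load' is typically called and its
# return value unpacked. The argument values are arbitrary examples, not
# recommended settings.
def _example_load_usage():
    train_sents, question_groups, answers, feature_sizes = load(
        subset=50, min_occ=5, min_files=2)
    # each row of 'train_sents' is one token, each column one feature
    log.info("Loaded %d tokens, %d features per token",
             train_sents.shape[0], train_sents.shape[1])
    return train_sents, question_groups, answers, feature_sizes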
def main(x_path, y_path):
    x = try_pickle_load(x_path)
    y = try_pickle_load(y_path)
    print "Shape of loaded x data is", x.shape
    print "Shape of loaded y data is", y.shape
    assert x.shape[0] == y.shape[0]

    # determine the sizes of the train/test split
    test_size = int(x.shape[0] * TEST_SIZE)
    train_size = x.shape[0] - test_size
    assert train_size + test_size == x.shape[0]
    print "Train size", train_size
    print "Test size", test_size

    # shuffle the sample indices and split them
    indices = np.arange(x.shape[0])
    np.random.shuffle(indices)
    train_ind = indices[:train_size]
    test_ind = indices[train_size:]

    train_set_x = x[train_ind]
    test_set_x = x[test_ind]
    train_set_y = y[train_ind]
    test_set_y = y[test_ind]

    # store the split next to the original x data
    folder_name = os.path.split(x_path)[0]
    print "Folder to save", folder_name

    try_pickle_dump(train_set_x, os.path.join(folder_name, "x_train.bin"))
    try_pickle_dump(test_set_x, os.path.join(folder_name, "x_test.bin"))
    try_pickle_dump(train_set_y, os.path.join(folder_name, "y_train.bin"))
    try_pickle_dump(test_set_y, os.path.join(folder_name, "y_test.bin"))

    print "Done"
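

# Entry-point sketch: an assumption about how this splitter script is meant
# to be invoked (pickled x path and y path as command-line arguments); the
# real module may wire its arguments up differently.
if __name__ == "__main__":
    import sys
    main(sys.argv[1], sys.argv[2])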
def faces(fold):
    """
    Retrieves a list of face images. Images are numpy arrays of
    (img_height, img_width, RGB) shape. The images represent the
    clipped and masked face images from the given fold of the
    FDDB database.

    :param fold: int indicating which fold is desired.
        In [1, 10] range.
    """
    log.info("Retrieving face images for fold %s", str(fold))

    # generate file name in which this fold's face images are stored
    faces_file_name = os.path.join(
        FACE_ONLY_ROOT, "fddb_facesonly_fold_{:02d}.zip".format(fold))

    # try to load and return pickled data
    face_images = util.try_pickle_load(faces_file_name, zip=False)
    if face_images is not None:
        return face_images

    # resulting face images
    # each image is a numpy array of RGB components of
    # (img_height, img_width, 3) shape
    face_images = []

    # go through all the photos in the fold
    # and their FDDB ellipse info (face annotations)
    for photo_path, (masks, bboxes) in image_face_masks_bboxes(fold).items():
        log.info("Processing photo %s", photo_path)

        # load photo
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)

        # for each ellipse info get mask and bbox
        for mask, bbox in zip(masks, bboxes):
            # apply the bounding box
            log.debug("Applying mask and bounds")
            face_img = np.array(photo_RGB[bbox[0][0]:bbox[1][0],
                                          bbox[0][1]:bbox[1][1], :])

            # apply the mask
            face_mask = mask[bbox[0][0]:bbox[1][0], bbox[0][1]:bbox[1][1]]
            face_img[np.logical_not(face_mask), :] = 0

            # store the image
            face_images.append(face_img)

    # store image data for subsequent usage
    if not util.try_pickle_dump(face_images, faces_file_name, zip=False):
        raise RuntimeError("Failed to pickle face images")

    return face_images
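

# Usage sketch (illustrative): collect the masked face crops of one FDDB fold
# and log a simple summary. The fold number is an arbitrary example.
def _example_faces_usage():
    face_images = faces(1)
    # every crop is an (img_height, img_width, 3) array with non-face
    # pixels zeroed out by the ellipse mask
    log.info("Fold 1 contains %d face crops", len(face_images))
    return face_images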
def image_face_masks_bboxes(fold):
    """
    Returns a dictionary in which keys are file paths of images
    belonging to the fold. Values are tuples (masks, bboxes) where
    "masks" are lists of face-ellipse boolean masks for that image
    and "bboxes" are bounding box info for that image. The returned
    dictionary is ordered the same way the ellipse info file is.
    """
    log.info("Retrieving image masks for fold %s", str(fold))

    # file name of the cached version
    masks_file_name = os.path.join(
        FACE_MASK_ROOT, "fddb_face_masks_fold{:02d}.zip".format(fold))

    # try to load and return pickled data
    masks = util.try_pickle_load(masks_file_name, zip=False)
    if masks is not None:
        return masks

    # there is no pickled version, we need to create the masks
    masks_dict = collections.OrderedDict()
    for photo_path, elipses in image_elipses(fold).items():
        log.info("Processing photo %s", photo_path)

        # load photo
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)
        photo_shape = photo_RGB.shape[:2]

        # for each ellipse info get mask and bbox, and store them
        # first prepare the numpy arrays in which they are stored
        masks = np.zeros((len(elipses), photo_shape[0], photo_shape[1]),
                         dtype=np.bool)
        bboxes = np.zeros((len(elipses), 2, 2), dtype=np.int32)

        # then put those arrays into the dict
        masks_dict[photo_path] = (masks, bboxes)

        # and then fill up the arrays with real data
        for elipse_ind, elipse in enumerate(elipses):
            log.debug("Calculating mask and bounds")
            mask, bbox = __elipsis_mask_and_box(photo_shape, elipse)
            masks[elipse_ind] = mask
            bboxes[elipse_ind] = bbox

    # store image data for subsequent usage
    if not util.try_pickle_dump(masks_dict, masks_file_name, zip=False):
        raise RuntimeError("Failed to pickle face masks")

    return masks_dict
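

# Usage sketch (illustrative): iterate over the per-photo masks and bounding
# boxes of a fold. The structure mirrors what the function above returns.
def _example_masks_usage():
    for photo_path, (masks, bboxes) in image_face_masks_bboxes(1).items():
        # 'masks' has shape (n_faces, img_height, img_width);
        # 'bboxes' has shape (n_faces, 2, 2), holding
        # [[row_start, col_start], [row_end, col_end]] per face
        log.info("%s: %d annotated faces", photo_path, masks.shape[0])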
def main(show=False):
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    # samples is a list of Sample objects
    samples = load_dataset(DATASET_PATH)
    samples = list(samples)

    # use only subset of data TODO remove this
    # DATA_TO_USE = 60
    # samples = samples[:DATA_TO_USE]

    random.seed(23455)
    random.shuffle(samples)
    train_samples, test_samples = split_samples(samples, 0.1)
    del samples

    cc = ClassCounter()

    x_train = generate_x(train_samples)
    x_test = generate_x(test_samples)
    y_train = generate_targets(train_samples, cc)
    y_test = generate_targets(test_samples, cc)
    del train_samples
    del test_samples

    cc.log_stats()

    try_pickle_dump(x_train, OUT_PATH + "x_train.bin")
    try_pickle_dump(x_test, OUT_PATH + "x_test.bin")
    try_pickle_dump(y_train, OUT_PATH + "y_train.bin")
    try_pickle_dump(y_test, OUT_PATH + "y_test.bin")

    # print x_train[0][0, 0, 80:90, 80:90]
    # print x_test[0][0, 0, 80:90, 80:90]

    if show:
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 0, :, :])  # rgb
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(y_train[j, :, :])
        pylab.show()
def histograms(fold):
    """
    Generates YIQ component histograms for face and not-face parts
    of the images of the given fold(s) of the FDDB database.
    Returns a tuple (hist_face, hist_noface).

    :type fold: int or iterable of ints
    :param fold: When int: number of the fold of the FDDB database.
        When iterable: a number of folds of the FDDB database.
    """
    if not isinstance(fold, int):
        # fold param is an iterable
        # get individual fold histograms
        hists_face, hists_noface = zip(*[histograms(f) for f in fold])

        # sum them up and return
        fold_count = len(hists_face)
        hist_face = sum(hists_face[1:], hists_face[0]) / fold_count
        hist_noface = sum(hists_noface[1:], hists_noface[0]) / fold_count
        return (hist_face, hist_noface)

    # generate file name in which this fold's histograms are stored
    hist_file_name = os.path.join(
        HIST_ROOT, "fddb_YIQ_histogram_fold_{:02d}.pkl".format(fold))

    # try to load and return pickled histogram data
    pickled_hist = util.try_pickle_load(hist_file_name)
    if pickled_hist is not None:
        return pickled_hist

    # failed to load pickled data, create histograms
    # prepare histograms
    # first dimension indicates Y, I or Q,
    # second dimension are bins
    hist_face = np.zeros((3, 256), np.int)
    hist_noface = np.zeros((3, 256), np.int)

    # go through all the photos in the fold
    # and their FDDB ellipse info (face annotations)
    for photo_path, (masks, bboxes) in image_face_masks_bboxes(fold).items():
        log.info("Processing photo %s", photo_path)

        # load photo, convert to YIQ
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)
        log.debug("Converting to YIQ")
        photo_YIQ = util.rgb_to_yiq(photo_RGB)

        # create masks from ellipses and OR them into one mask
        log.debug("Creating faces mask")
        mask_face = masks.any(axis=0)
        mask_noface = np.logical_not(mask_face)

        # add current image histograms to total histograms
        log.debug("Histogramming")
        for component_ind in range(3):
            hist_face[component_ind, :] += np.histogram(
                photo_YIQ[mask_face, component_ind],
                __bin_edges[component_ind])[0]
            hist_noface[component_ind, :] += np.histogram(
                photo_YIQ[mask_noface, component_ind],
                __bin_edges[component_ind])[0]

    # normalize histograms
    hist_face = hist_face.astype(np.float) / hist_face[1, :].sum()
    hist_noface = hist_noface.astype(np.float) / hist_noface[1, :].sum()

    # store histogram data for subsequent usage
    if not util.try_pickle_dump((hist_face, hist_noface), hist_file_name):
        raise RuntimeError("Failed to pickle histograms")

    return (hist_face, hist_noface)
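

# Usage sketch (illustrative): single-fold and multi-fold histogram requests.
# Fold numbers are arbitrary examples.
def _example_histograms_usage():
    # histograms for a single fold
    hist_face, hist_noface = histograms(1)
    # averaged histograms over several folds (the iterable branch above)
    hist_face_avg, hist_noface_avg = histograms(range(1, 6))
    return hist_face, hist_noface, hist_face_avg, hist_noface_avg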
def main(conf, gen_func, n_layers, show=False):
    """
    conf: dictionary
        configuration dictionary, from json file
    gen_func: function
        function used for generating inputs to the network
    n_layers: int
        number of layers of the laplacian pyramid used as input
    show: bool
        if True, a few parsed images will be shown as a result
    """
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    # samples is a list of Sample objects
    dataset_path = conf['training']['dataset-folder']
    samples = load_dataset(dataset_path)
    samples = list(samples)

    random.seed(conf['training']['shuffle-seed'])
    random.shuffle(samples)

    validation_size = float(conf['training']['validation-percent']) / 100.0
    train_samples, validation_samples = split_samples(samples,
                                                      validation_size)
    del samples

    out_folder = conf['training']['out-folder']
    write_samples_log(train_samples,
                      os.path.join(out_folder, "samples_train.log"))
    write_samples_log(validation_samples,
                      os.path.join(out_folder, "samples_validation.log"))

    cc = ClassCounter()

    x_train = generate_x(train_samples, n_layers, gen_func)
    x_validation = generate_x(validation_samples, n_layers, gen_func)
    y_train = generate_targets(train_samples, cc)
    y_validation = generate_targets(validation_samples, cc)
    del train_samples
    del validation_samples

    try_pickle_dump(x_train, os.path.join(out_folder, "x_train.bin"))
    try_pickle_dump(x_validation,
                    os.path.join(out_folder, "x_validation.bin"))
    try_pickle_dump(y_train, os.path.join(out_folder, "y_train.bin"))
    try_pickle_dump(y_validation,
                    os.path.join(out_folder, "y_validation.bin"))

    # if test data defined
    if 'test' in conf:
        logger.info("Found test configuration, generating test data")
        test_samples = load_dataset(conf['test']['dataset-folder'])
        test_samples = list(test_samples)
        write_samples_log(test_samples,
                          os.path.join(out_folder, "samples_test.log"))
        x_test = generate_x(test_samples, n_layers, gen_func)
        y_test = generate_targets(test_samples, cc)
        try_pickle_dump(x_test, os.path.join(out_folder, "x_test.bin"))
        try_pickle_dump(y_test, os.path.join(out_folder, "y_test.bin"))

    cc.log_stats()

    if show:
        # show a few parsed samples from the train set
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 0, :, :])  # Y
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 3, :, :])  # depth
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 2 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(y_train[j, :, :])
        pylab.show()
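

# Configuration sketch (an assumption, not the project's canonical config
# file): these are the keys this 'main' actually reads; the values are
# placeholders for illustration only.
_EXAMPLE_CONF = {
    "training": {
        "dataset-folder": "path/to/train-dataset",   # placeholder path
        "shuffle-seed": 12345,
        "validation-percent": 10,
        "out-folder": "path/to/output",              # placeholder path
    },
    # optional: when present, test inputs/targets are generated as well
    "test": {
        "dataset-folder": "path/to/test-dataset",    # placeholder path
    },
}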
def eval_model(conf, train_fn, test_fn, n_train_batches, n_test_batches,
               layers, pre_fn=None, l_rate_wrapper=None):
    """
    Function for training and validating models

    conf: dictionary
        configuration params
    train_fn: theano function
        training function
    test_fn: theano function
        validation function
    n_train_batches: int
        number of batches for training
    n_test_batches: int
        number of batches for validation
    layers: list
        list of layers, used to extract best params
    pre_fn: function
        function to be called before training
    l_rate_wrapper: UpdateParameters object
        learning rate wrapper object

    returns: (best_validation_error, best_iter, best_params)
        the best validation error, iteration and parameters
    """
    assert type(conf) is dict

    n_epochs = conf['epochs']
    if n_epochs < 0:
        n_epochs = maxint

    # how often to lower learning rate if no improvement
    epochs_check_learn_rate = None
    if 'learning-rate-decrease-params' in conf:
        lrdp_params = conf['learning-rate-decrease-params']
        epochs_check_learn_rate = lrdp_params['no-improvement-epochs']
        min_learning_rate = lrdp_params['min-learning-rate']

    # file for dumping weights
    now = datetime.now()
    weights_filename = "network-%d-%d.bin" % (now.hour, now.minute)

    logger.info('... training')

    # early-stopping parameters
    # look at this many iterations regardless
    patience = n_train_batches * 20  # skip first 20 epochs
    # wait this much longer when a new best is found
    patience_increase = 1.1
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.998

    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)
    logger.debug('Validation frequency is %d' % validation_frequency)

    set_layers_training_mode(layers, 1)

    best_validation_loss = numpy.inf
    best_iter = 0
    best_epoch = 0  # best epoch for train cost
    best_params = []
    best_train_cost = numpy.inf

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        # function to be called before every epoch
        if pre_fn is not None:
            pre_fn()

        training_costs = numpy.zeros((n_train_batches), dtype='float32')
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij = train_fn(minibatch_index)
            training_costs[minibatch_index] = cost_ij
            # logger.info('training @ iter = %d, cost %f' % (iter, cost_ij))
            stdout.write('.')
            stdout.flush()

            if (iter + 1) % validation_frequency == 0:
                stdout.write('\n')  # newline after iteration dots

                set_layers_training_mode(layers, 0)

                # compute zero-one loss on validation set
                validation = [test_fn(i) for i in xrange(n_test_batches)]

                set_layers_training_mode(layers, 1)

                validation_losses = [v[0] for v in validation]
                validation_costs = [v[1] for v in validation]

                # class accuracies
                correct = numpy.zeros((layers[0].n_classes), dtype='int32')
                total = numpy.zeros((layers[0].n_classes), dtype='int32')
                for v in validation:
                    correct += v[2]
                    total += v[3]
                validation_class_accuracy = calc_class_accuracy(correct,
                                                                total)

                this_validation_loss = numpy.mean(validation_losses)
                logger.info(
                    'epoch %i, minibatch %i/%i, validation error %f %%',
                    epoch, minibatch_index + 1, n_train_batches,
                    this_validation_loss * 100.)
                logger.info('validation cost: %f',
                            numpy.mean(validation_costs))
                logger.info('mean class accuracy: %f %%',
                            validation_class_accuracy * 100.)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(
                            patience,
                            10 * n_train_batches +
                            int(iter * patience_increase + 1))
                        logger.info("Patience increased to %d epochs",
                                    int(patience / n_train_batches))

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # save model parameters
                    best_params = [l.get_weights() for l in layers]
                    try_pickle_dump(best_params, weights_filename)

                    logger.info(('     epoch %i, minibatch %i/%i,'
                                 ' validation error of best model %f %%') %
                                (epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.))

            if patience <= iter:
                logger.info("Ran out of patience")
                done_looping = True
                break

        train_cost = numpy.mean(training_costs)
        logger.info('Average training cost %f', train_cost)
        if train_cost < best_train_cost * improvement_threshold:
            best_train_cost = train_cost
            best_epoch = epoch

        # lower learning rate if no improvement
        # (only when decrease params and a rate wrapper were provided)
        if l_rate_wrapper is not None and epochs_check_learn_rate is not None:
            learn_rate = l_rate_wrapper.learning_rate.get_value()
            if learn_rate > min_learning_rate and \
                    (epoch - best_epoch + 1) % epochs_check_learn_rate == 0:
                l_rate_wrapper.lower_rate_by_factor(0.5)

    logger.info('Optimization complete.')
    return best_validation_loss, best_iter, best_params
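

# Usage sketch (illustrative): how 'eval_model' might be driven once the
# Theano train/test functions and layers exist. Everything passed in here is
# a placeholder for objects built elsewhere in the project, and the batch
# counts and configuration values are arbitrary examples.
def _example_eval_model_usage(train_fn, test_fn, layers, l_rate_wrapper):
    # minimal configuration understood by eval_model
    conf = {
        'epochs': 100,
        'learning-rate-decrease-params': {
            'no-improvement-epochs': 5,
            'min-learning-rate': 1e-5,
        },
    }
    best_loss, best_iter, best_params = eval_model(
        conf, train_fn, test_fn, n_train_batches=100, n_test_batches=20,
        layers=layers, l_rate_wrapper=l_rate_wrapper)
    logger.info("Best validation loss %f at iteration %d",
                best_loss, best_iter)
    return best_params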
def load_ngrams(n, features_use, tree, subset=None, min_occ=1, min_files=1,
                remove_subst_tokens=False):
    """
    Loads the dataset for the Microsoft Sentence Completion Challenge,
    processed into ngrams.

    The raw dataset is loaded and processed using the 'load' function,
    to which 'subset', 'min_occ' and 'min_files' are forwarded. The
    resulting dataset is then processed into ngrams using the 'ngrams'
    function, to which the 'n' and 'tree' parameters are forwarded.
    This is then cached on the disk for subsequent usage. The resulting
    ngrams are pruned of unwanted features as indicated by the
    'features_use' parameter.

    Returns a tuple (sents, q_groups, answers, feature_sizes). This
    reflects the value returned by the 'load' function, except that
    'sents' and 'q_groups' are now not just features extracted from
    text, but ngrams built from those features.
    """
    features_use = np.array(features_use, dtype=bool)

    log.info("Loading %d-grams, %s, features_use: %s", n,
             "tree" if tree else "linear",
             "".join([str(int(i)) for i in features_use]))

    dir = os.path.join("data", "processed")
    if not os.path.exists(dir):
        os.makedirs(dir)
    name_base = "%s-%d_grams-subset_%r-min_occ_%r-min_files_%r" % (
        "tree" if tree else "linear", n, subset, min_occ, min_files)

    # tree-grams can all be seen as a feature-subset of 4-grams
    if tree and n < 4:
        ngrams_all = load_ngrams(4, np.ones(features_use.size, dtype=bool),
                                 tree, subset, min_occ, min_files,
                                 remove_subst_tokens)
    else:
        # look for the cached 4-grams with all the features
        file_name = os.path.join(dir, name_base + ".pkl")
        ngrams_all = util.try_pickle_load(file_name)

        # it is possible that sentences are split
        # in order to avoid a Python bug with storing large arrays
        if ngrams_all is not None and isinstance(ngrams_all[0], list):
            sents = np.vstack(ngrams_all[0])
            ngrams_all = (sents, ) + ngrams_all[1:]

        # if unable to load cached data, create it
        if ngrams_all is None:
            # load data
            tokens, q_groups, answers, ftr_sizes = load(
                subset, min_occ, min_files)

            # tokens that should not be present in ngrams
            # the purpose is to remove ngrams containing tokens that are
            # substitutes for removed ones
            invalid_tokens = None
            if remove_subst_tokens:
                invalid_tokens = dict(zip(range(3), ftr_sizes[:3] - 1))
                log.info("Invalid tokens: %r", invalid_tokens)

            # define a function for generating ngrams, and process
            # trainset and questions
            _ngrams = lambda tokens: ngrams(n, tree, tokens, invalid_tokens)
            sent_ngrams = _ngrams(tokens)
            q_ngrams = [map(_ngrams, qg) for qg in q_groups]

            # store the processed data for subsequent usage
            # split sent ngrams to avoid Py bug with pickling large arrays
            util.try_pickle_dump(
                (np.vsplit(sent_ngrams,
                           np.arange(1, 10) * (len(sent_ngrams) / 10)),
                 q_ngrams, answers, ftr_sizes), file_name)

            ngrams_all = (sent_ngrams, q_ngrams, answers, ftr_sizes)

    # remove unwanted features from ngrams_all
    used_ftr = np.arange(ngrams_all[0].shape[1])[np.tile(features_use, n)]
    sents = ngrams_all[0][:, used_ftr]
    q_groups = [[q[:, used_ftr] for q in qg] for qg in ngrams_all[1]]

    return (sents, q_groups) + ngrams_all[2:]
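

# Usage sketch (illustrative): load linear trigrams keeping only some of the
# features. The feature mask below is an arbitrary example; its length must
# match the per-token feature count produced by 'load'.
def _example_load_ngrams_usage():
    features_use = [True, True, True, False, False]  # example mask
    sents, q_groups, answers, feature_sizes = load_ngrams(
        3, features_use, tree=False, subset=50, min_occ=5, min_files=2)
    log.info("Loaded %d ngrams with %d columns each",
             sents.shape[0], sents.shape[1])
    return sents, q_groups, answers, feature_sizes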
def main(conf, gen_func, n_layers, show=False):
    """
    conf: dictionary
        configuration dictionary, from json file
    gen_func: function
        function used for generating inputs to the network
    n_layers: int
        number of layers of the laplacian pyramid used as input
    show: bool
        if True, a few parsed images will be shown as a result
    """
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    # samples is a list of Sample objects
    dataset_path = conf["training"]["dataset-folder"]
    samples = load_dataset(dataset_path)
    samples = list(samples)

    if "data-subset" in conf["training"]:
        # use only a subset of the data
        data_to_use = conf["training"]["data-subset"]
        logger.info("Using only subset of %d samples", data_to_use)
        samples = samples[:data_to_use]

    random.seed(conf["training"]["shuffle-seed"])
    random.shuffle(samples)

    out_folder = conf["training"]["out-folder"]

    # if test data defined
    if "test-percent" in conf["training"]:
        logger.info("Found test configuration, generating test data")
        test_size = float(conf["training"]["test-percent"]) / 100.0
        samples, test_samples = split_samples(samples, test_size)

        write_samples_log(test_samples,
                          os.path.join(out_folder, "samples_test.log"))
        x_test = generate_x(test_samples, n_layers, gen_func)
        y_test = generate_targets(test_samples)
        try_pickle_dump(x_test, os.path.join(out_folder, "x_test.bin"))
        try_pickle_dump(y_test, os.path.join(out_folder, "y_test.bin"))
    else:
        logger.info("No test set configuration present")

    validation_size = float(conf["training"]["validation-percent"]) / 100.0
    train_samples, validation_samples = split_samples(samples,
                                                      validation_size)
    del samples

    write_samples_log(train_samples,
                      os.path.join(out_folder, "samples_train.log"))
    write_samples_log(validation_samples,
                      os.path.join(out_folder, "samples_validation.log"))

    x_train = generate_x(train_samples, n_layers, gen_func)
    x_validation = generate_x(validation_samples, n_layers, gen_func)
    y_train = generate_targets(train_samples)
    y_validation = generate_targets(validation_samples)
    del train_samples
    del validation_samples

    try_pickle_dump(x_train, os.path.join(out_folder, "x_train.bin"))
    try_pickle_dump(x_validation,
                    os.path.join(out_folder, "x_validation.bin"))
    try_pickle_dump(y_train, os.path.join(out_folder, "y_train.bin"))
    try_pickle_dump(y_validation,
                    os.path.join(out_folder, "y_validation.bin"))

    if show:
        # show a few parsed samples from the train set
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis("off")
            pylab.imshow(x_train[0][j, 0, :, :])  # rgb
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis("off")
            pylab.imshow(y_train[j, :, :])
        pylab.show()
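

# Configuration sketch (an assumption, not the project's canonical config
# file): the keys read by this version of 'main', with placeholder values.
# "data-subset" and "test-percent" are optional and trigger the corresponding
# branches above.
_EXAMPLE_TRAIN_CONF = {
    "training": {
        "dataset-folder": "path/to/dataset",   # placeholder path
        "out-folder": "path/to/output",        # placeholder path
        "shuffle-seed": 12345,
        "validation-percent": 10,
        "test-percent": 10,     # optional
        "data-subset": 200,     # optional
    },
}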