Example #1
    def predict(self, model):
        """
        Given a model, generate sentences from this dataset.

        Args:
            model (Model): Image captioning model.

        Returns:
            tuple: list of predicted sentences and list of target sentences
        """
        sents = []
        targets = []
        y = self.be.zeros(self.dev_X.shape)
        for mb_idx, (x, t) in enumerate(self):
            y.fill(0)
            # Repeatedly generate next word in sentence and choose max prob word each time.
            for step in range(1, self.max_sentence_length + 1):
                prob = model.fprop(
                    (x[0], y), inference=True).get()[:, :-self.be.bsz].copy()
                pred = np.argmax(prob, axis=0)
                prob.fill(0)
                for i in range(step * self.be.bsz):
                    prob[pred[i], i] = 1
                y[:] = prob
            sents += self.prob_to_word(y)
            # Test set, keep list of targets
            if isinstance(self, ImageCaptionTest):
                targets += t[0]
            # Train set, only 1 target
            else:
                targets.append(t[0])

        return sents, targets
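The inner loop converts each column of probabilities into a one-hot vector via argmax before feeding it back to the model. A minimal NumPy-only sketch of that argmax-to-one-hot step, with toy sizes and no neon backend (names are illustrative only):

import numpy as np

# vocab_size=4, three columns (e.g. batch positions)
prob = np.array([[0.1, 0.7, 0.2],
                 [0.6, 0.1, 0.2],
                 [0.2, 0.1, 0.5],
                 [0.1, 0.1, 0.1]])
pred = np.argmax(prob, axis=0)               # winning word index per column -> [1, 0, 2]
onehot = np.zeros_like(prob)
onehot[pred, np.arange(prob.shape[1])] = 1   # same effect as the per-column loop above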
Example #2
    def trunc_bprop_tt(self, debug, numgrad=None):
        """
        TODO: move the loop over t into the layer class.
        """
        if numgrad is None:
            min_unroll = 1
        else:
            logger.debug("MLP.bprop single unrolling for numgrad")
            min_unroll = self.unrolls

        for tau in range(min_unroll, self.unrolls + 1):
            self.cost_layer.cost.set_outputbuf(
                self.class_layer.output_list[tau-1])
            self.cost_layer.bprop(None, tau-1)
            if debug:
                tmp = self.cost_layer.targets[tau-1].asnumpyarray()
                tmp = tmp.argmax(0)[0]
                logger.debug("in RNNB.bprop, tau %d target %d" % (tau-1, tmp))
            error = self.cost_layer.deltas
            self.class_layer.bprop(error, tau, numgrad=numgrad)
            error = self.class_layer.deltas
            for t in list(range(0, tau))[::-1]:
                if 'c_t' in self.rec_layer.__dict__:
                    cerror = self.rec_layer.celtas  # on t=0, prev batch state
                else:
                    cerror = None  # for normal RNN
                self.rec_layer.bprop(error, cerror, t, numgrad=numgrad)
                error[:] = self.rec_layer.deltas  # [TODO] why need deepcopy?
Example #3
    def prob_to_word(self, prob):
        """
        Convert one-hot probabilities to sentences.

        Args:
            prob (Tensor): Word probabilities of each sentence of batch.
                           Of size (vocab_size, batch_size * (max_sentence_length+1))

        Returns:
            list containing sentences
        """

        sents = []

        if not isinstance(prob, np.ndarray):
            prob = prob.get()
        words = [
            self.index_to_vocab[x] for x in np.argmax(prob, axis=0).tolist()
        ]

        for sent_index in range(self.be.bsz):
            sent = []
            for i in range(self.max_sentence_length):
                word = words[self.be.bsz * i + sent_index]
                sent.append(word)
                if (i > 0 and word == self.end_token) or i >= 20:
                    break
            sents.append(" ".join(sent))

        return sents
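The same decoding idea can be shown standalone with plain NumPy and a toy vocabulary; the vocabulary, batch size, and step-major column layout below are made up for illustration:

import numpy as np

index_to_vocab = {0: 'a', 1: 'dog', 2: 'runs', 3: '.'}   # toy vocabulary
bsz, steps = 2, 3
# columns are step-major: step 0 for every sentence, then step 1, and so on
prob = np.zeros((len(index_to_vocab), bsz * steps))
prob[[1, 0, 2, 1, 3, 3], np.arange(bsz * steps)] = 1
words = [index_to_vocab[i] for i in np.argmax(prob, axis=0)]
sents = [" ".join(words[s::bsz]) for s in range(bsz)]     # -> ['dog runs .', 'a dog .']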
Example #4
    def get_ranks(self, values):
        """
        Computes the rank of each value in the passed list, from lowest to
        highest. Ties are given the same rank (the average of the positions
        they occupy).

        Arguments:
            values (list): The list of numeric values to be ranked.

        Returns:
            list: Same length as values with the positional rank of each
                  original value (1-based).
        """
        num_vals = len(values)
        srt_vals = sorted(zip(values, list(range(num_vals))))
        ranks = [0 for i in values]
        val = srt_vals[0][0]
        high_rank = 0
        for i in range(num_vals):
            if val != srt_vals[i][0]:
                val = srt_vals[i][0]
                for j in range(high_rank, i):
                    ranks[srt_vals[j][1]] = float(high_rank + i + 1) / 2.0
                high_rank = i
            if i == (num_vals - 1):
                for j in range(high_rank, i + 1):
                    ranks[srt_vals[j][1]] = float(high_rank + i + 2) / 2.0
        return ranks
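Assuming an object metric that exposes the method above (the name is illustrative), tied values receive the average of the positions they occupy:

metric.get_ranks([3, 1, 2])       # -> [3.0, 1.0, 2.0]
metric.get_ranks([1, 2, 2, 3])    # -> [1.0, 2.5, 2.5, 4.0]  (the two 2s share rank 2.5)
metric.get_ranks([5, 5, 5])       # -> [2.0, 2.0, 2.0]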
Example #5
def display_text(index_to_token, gt, pr):
    """
    Print out some example strings of input-output pairs.
    """
    index_to_token[0] = '|'  # remove actual line breaks

    display_len = 3 * time_steps

    # sample 3 sentences and their start and end time steps
    (s1_s, s1_e) = (0, time_steps)
    (s2_s, s2_e) = (time_steps, 2 * time_steps)
    (s3_s, s3_e) = (2 * time_steps, 3 * time_steps)

    gt_string = "".join([index_to_token[gt[k]] for k in range(display_len)])
    pr_string = "".join([index_to_token[pr[k]] for k in range(display_len)])

    match = np.where(
        [gt_string[k] == pr_string[k] for k in range(display_len)])

    di_string = "".join(
        [gt_string[k] if k in match[0] else '.' for k in range(display_len)])

    neon_logger.display('GT:   [' + gt_string[s1_s:s1_e] + '] '
                        '[' + gt_string[s2_s:s2_e] + '] '
                        '[' + gt_string[s3_s:s3_e] + '] ')

    neon_logger.display('Pred: [' + pr_string[s1_s:s1_e] + '] '
                        '[' + pr_string[s2_s:s2_e] + '] '
                        '[' + pr_string[s3_s:s3_e] + '] ')

    neon_logger.display('Difference indicated by .')
    neon_logger.display('Diff: [' + di_string[s1_s:s1_e] + '] '
                        '[' + di_string[s2_s:s2_e] + '] '
                        '[' + di_string[s3_s:s3_e] + '] ')
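The '.'-masked diff string can be built in isolation; a short sketch with made-up strings:

import numpy as np

gt_string = "the cat sat"
pr_string = "the bat sit"
match = np.where([g == p for g, p in zip(gt_string, pr_string)])
di_string = "".join(c if k in match[0] else '.'
                    for k, c in enumerate(gt_string))
# di_string -> 'the .at s.t'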
Example #6
    def trunc_bprop_tt(self, debug, numgrad=None):
        """
        TODO: move the loop over t into the layer class.
        """
        if numgrad is None:
            min_unroll = 1
        else:
            logger.debug("MLP.bprop single unrolling for numgrad")
            min_unroll = self.unrolls

        for tau in range(min_unroll, self.unrolls + 1):
            self.cost_layer.cost.set_outputbuf(
                self.class_layer.output_list[tau-1])
            self.cost_layer.bprop(None, tau-1)
            if debug:
                tmp = self.cost_layer.targets[tau-1].asnumpyarray()
                tmp = tmp.argmax(0)[0]
                logger.debug("in RNNB.bprop, tau %d target %d" % (tau-1, tmp))
            error = self.cost_layer.deltas
            self.class_layer.bprop(error, tau, numgrad=numgrad)
            error = self.class_layer.deltas
            for t in list(range(0, tau))[::-1]:
                if 'c_t' in self.rec_layer.__dict__:
                    cerror = self.rec_layer.celtas  # on t=0, prev batch state
                else:
                    cerror = None  # for normal RNN
                self.rec_layer.bprop(error, cerror, t, numgrad=numgrad)
                error[:] = self.rec_layer.deltas  # [TODO] why need deepcopy?
Example #7
def load_dataset(basepath, datadir, shuffle):
    path = os.path.join(basepath, datadir)
    if not os.path.exists(path):
        process(basepath)
    subdirs = glob(os.path.join(path, '*'))
    labelnames = sorted([os.path.basename(x) for x in subdirs])
    inds = list(range(len(labelnames)))
    labeldict = {key: val for key, val in zip(labelnames, inds)}
    lines = []
    for subdir in subdirs:
        subdirlabel = labeldict[os.path.basename(subdir)]
        files = glob(os.path.join(subdir, '*.png'))
        lines += [(filename, subdirlabel) for filename in files]
    assert (len(lines) > 0)
    data = None
    if shuffle:
        np.random.seed(0)
        np.random.shuffle(lines)
    for idx in range(len(lines)):
        # Convert from RGB to BGR to be consistent with the generic data loader
        im = np.asarray(Image.open(lines[idx][0]))[:, :, ::-1]
        im = np.transpose(im, axes=[2, 0, 1]).ravel()
        if data is None:
            data = np.empty((len(lines), im.shape[0]), dtype='float32')
            labels = np.empty((len(lines), 1), dtype='int32')
        data[idx] = im
        labels[idx] = lines[idx][1]
    data_view = data.reshape((data.shape[0], 3, -1))
    # Subtract mean values of B, G, R
    data_view -= np.array((127, 119, 104)).reshape((1, 3, 1))
    return (data, labels)
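The channel flip and mean subtraction can be exercised on a single image with NumPy alone; the image size is made up, and the B, G, R means are the ones used above:

import numpy as np

rgb = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)   # H x W x C, RGB
bgr = rgb[:, :, ::-1]                                                # reverse channel axis: RGB -> BGR
im = np.transpose(bgr, axes=[2, 0, 1]).ravel().astype('float32')    # flatten to C*H*W
view = im.reshape((3, -1))
view -= np.array([127.0, 119.0, 104.0]).reshape((3, 1))             # subtract per-channel means in place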
Example #8
    def write_batches(self,
                      name,
                      start,
                      labels,
                      imfiles,
                      targets=None,
                      is_tar=False):
        pool = Pool(processes=self.num_workers)
        psz = self.batch_size
        osz = self.output_image_size
        npts = (len(imfiles) + psz - 1) // psz

        imfiles = [imfiles[i * psz:(i + 1) * psz] for i in range(npts)]

        if targets is not None:
            targets = [
                targets[i * psz:(i + 1) * psz].T.copy() for i in range(npts)
            ]

        labels = [{k: v[i * psz:(i + 1) * psz]
                   for k, v in labels.iteritems()} for i in range(npts)]

        accum_buf = np.zeros((osz, osz, self.num_channels), dtype=np.int32)
        batch_mean = np.zeros(accum_buf.shape, dtype=np.uint8)
        logger.info("Writing %s batches...", name)
        for i, jpeg_file_batch in enumerate(imfiles):
            t = time()
            if is_tar:
                jpeg_file_batch = [j.read() for j in jpeg_file_batch]
            jpeg_strings = pool.map(
                functools.partial(proc_img, is_string=is_tar), jpeg_file_batch)
            targets_batch = None if targets is None else targets[i]
            labels_batch = labels[i]
            bfile = os.path.join(self.out_dir, 'data_batch_%d' % (start + i))
            serialize(
                {
                    'data': jpeg_strings,
                    'labels': labels_batch,
                    'targets': targets_batch
                }, bfile)
            logger.info("Wrote to %s (%s batch %d of %d) (%.2f sec)",
                        self.out_dir, name, i + 1, len(imfiles),
                        time() - t)

            # get the means and accumulate
            imgworker.calc_batch_mean(jpglist=jpeg_strings,
                                      tgt=batch_mean,
                                      orig_size=osz,
                                      rgb=self.rgb,
                                      nthreads=self.num_workers)

            # scale for the case where we have an undersized batch
            if len(jpeg_strings) < self.batch_size:
                batch_mean *= len(jpeg_strings) / self.batch_size
            accum_buf += batch_mean
        pool.close()
        mean_buf = self.train_mean if name == 'train' else self.val_mean
        mean_buf[:] = accum_buf / len(imfiles)
Example #9
    def load(self, backend=None, experiment=None):
        if self.inputs['train'] is not None:
            return
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(os.path.expanduser(
                self.repo_path))
            save_dir = os.path.join(self.repo_path,
                                    self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            for url in (self.raw_train_input_gz, self.raw_train_target_gz,
                        self.raw_test_input_gz, self.raw_test_target_gz):
                name = os.path.basename(url).rstrip('.gz')
                repo_gz_file = os.path.join(save_dir, name + '.gz')
                repo_file = repo_gz_file.rstrip('.gz')
                if not os.path.exists(repo_file):
                    self.download_to_repo(url, save_dir)
                    with gzip.open(repo_gz_file, 'rb') as infile:
                        with open(repo_file, 'w') as outfile:
                            for line in infile:
                                outfile.write(line)
                logger.info('loading: %s', name)
                if 'images' in repo_file and 'train' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    # flatten to 1D images
                    self.inputs['train'] = indat
                elif 'images' in repo_file and 't10k' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    self.inputs['test'] = indat[0:self.num_test_sample]
                elif 'labels' in repo_file and 'train' in repo_file:
                    indat = self.read_label_file(repo_file)
                    # Prep a 1-hot label encoding
                    tmp = np.zeros((indat.shape[0], 10), dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['train'] = tmp
                elif 'labels' in repo_file and 't10k' in repo_file:
                    indat = self.read_label_file(
                        repo_file)[0:self.num_test_sample]
                    tmp = np.zeros((self.num_test_sample, 10),
                                   dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['test'] = tmp
                else:
                    logger.error('problems loading: %s', name)
            if 'sample_pct' in self.__dict__:
                self.sample_training_data()
            if hasattr(self, 'validation_pct'):
                self.split_set(
                    self.validation_pct, from_set='train', to_set='validation')
            self.format()
        else:
            raise AttributeError('repo_path not specified in config')
Example #10
    def load(self, backend=None, experiment=None):
        self.initialize()
        if self.inputs['train'] is not None:
            return
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(
                os.path.expanduser(self.repo_path))
            save_dir = os.path.join(self.repo_path, self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            train_idcs = list(range(1000000))  # 1M letters out of 1.23M
            test_idcs = range(1000000, 1010000)
            if 'sample_pct' in self.__dict__:
                if self.sample_pct >= 1.0:
                    self.sample_pct /= 100.0
                    logger.info('sampling pct: %0.2f' % self.sample_pct)
                if self.sample_pct < 1.0:
                    # numpy.random.shuffle(train_idcs)
                    pass
                train_idcs = train_idcs[0:int(1000000 * self.sample_pct)]
            url = self.raw_base_url
            name = os.path.basename(url).rstrip('.txt')
            repo_file = os.path.join(save_dir, name + '.txt')
            if not os.path.exists(repo_file):
                self.download_to_repo(url, save_dir)
            logger.info('loading: %s' % name)
            indat = self.read_txt_file(repo_file)
            self.preinputs = dict()
            self.preinputs['train'] = indat[:, train_idcs]
            self.preinputs['test'] = indat[:, test_idcs]

            for dataset in ('train', 'test'):
                num_batches = self.preinputs[dataset].shape[1] // self.batch_size
                idx_list = numpy.arange(num_batches * self.batch_size)
                idx_list = idx_list.reshape(self.batch_size, num_batches)
                splay_3d = self.preinputs[dataset][:, idx_list.T]
                splay_3d = numpy.transpose(splay_3d, (1, 0, 2))
                splay_3d = splay_3d.reshape(-1, self.batch_size)
                self.inputs[dataset] = splay_3d
                offbyone = numpy.zeros(splay_3d.shape)
                length = offbyone.shape[0]
                offbyone[0:length -
                         self.data_dim, :] = splay_3d[self.data_dim:length, :]
                self.targets[dataset] = offbyone
            if hasattr(self, 'validation_pct'):
                self.split_set(self.validation_pct,
                               from_set='train',
                               to_set='validation')
            self.format(dtype=self.backend_type)  # runs transpose_batches

        else:
            raise AttributeError('repo_path not specified in config')
Example #11
    def load(self, backend=None, experiment=None):
        self.initialize()
        if self.inputs['train'] is not None:
            return
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(os.path.expanduser(
                                                self.repo_path))
            save_dir = os.path.join(self.repo_path,
                                    self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            train_idcs = list(range(1000000))  # 1M letters out of 1.23M
            test_idcs = range(1000000, 1010000)
            if 'sample_pct' in self.__dict__:
                if self.sample_pct >= 1.0:
                    self.sample_pct /= 100.0
                    logger.info('sampling pct: %0.2f' % self.sample_pct)
                if self.sample_pct < 1.0:
                    # numpy.random.shuffle(train_idcs)
                    pass
                train_idcs = train_idcs[0:int(1000000 * self.sample_pct)]
            url = self.raw_base_url
            name = os.path.basename(url).rstrip('.txt')
            repo_file = os.path.join(save_dir, name + '.txt')
            if not os.path.exists(repo_file):
                self.download_to_repo(url, save_dir)
            logger.info('loading: %s' % name)
            indat = self.read_txt_file(repo_file)
            self.preinputs = dict()
            self.preinputs['train'] = indat[:, train_idcs]
            self.preinputs['test'] = indat[:, test_idcs]

            for dataset in ('train', 'test'):
                num_batches = self.preinputs[dataset].shape[1] // self.batch_size
                idx_list = numpy.arange(num_batches * self.batch_size)
                idx_list = idx_list.reshape(self.batch_size, num_batches)
                splay_3d = self.preinputs[dataset][:, idx_list.T]
                splay_3d = numpy.transpose(splay_3d, (1, 0, 2))
                splay_3d = splay_3d.reshape(-1, self.batch_size)
                self.inputs[dataset] = splay_3d
                offbyone = numpy.zeros(splay_3d.shape)
                length = offbyone.shape[0]
                offbyone[0:length - self.data_dim, :] = splay_3d[self.data_dim:
                                                                 length, :]
                self.targets[dataset] = offbyone
            if hasattr(self, 'validation_pct'):
                self.split_set(
                    self.validation_pct, from_set='train', to_set='validation')
            self.format(dtype=self.backend_type)  # runs transpose_batches

        else:
            raise AttributeError('repo_path not specified in config')
Example #12
    def allocate_output_bufs(self):
        """ all the activations and temp buffers live here """
        super(RecurrentLSTMLayer, self).allocate_output_bufs()

        # things that are not initialized by the super class
        be = self.backend
        net_sze = (self.nout, self.batch_size)  # tuple with activation size.

        # buffers for gate activation
        for a in ['i', 'f', 'o', 'g']:
            setattr(self, a + '_t',
                    [be.zeros(net_sze) for k in range(self.unrolls)])
            setattr(self, 'net_' + a,
                    [be.zeros(net_sze) for k in range(self.unrolls)])

        # outputs: pre-allocate for d{i,f,o,c}_dh1
        self.d_dh1 = {
            gateid: be.zeros(net_sze)
            for gateid in ['i', 'f', 'o', 'c']
        }
        self.dc_d_dh1 = {
            gateid: be.zeros(net_sze)
            for gateid in ['i', 'f', 'c']
        }
        self.errs = {
            hcval: be.zeros(net_sze)
            for hcval in ['hh', 'hc', 'ch', 'cc']
        }
        self.gatedic = {}
        self.gatedic_u = {}

        # buffers for cell and output
        self.c_t = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.c_phi = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.c_phip = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.output_list = [be.zeros(net_sze) for k in range(self.unrolls)]

        # pre-allocate preactivation buffers
        self.temp_x = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.temp_h = [be.zeros(net_sze) for k in range(self.unrolls)]

        # pre-allocate derivative buffers
        self.dh_dwx_buf = be.zeros((self.nout, self.nin))
        self.dh_dwh_buf = be.zeros((self.nout, self.nout))

        self.delta_buf = be.zeros(net_sze)
        self.bsum_buf = be.zeros((self.nout, 1))

        # This quantity seems to be computed repeatedly
        # error_h * self.o_t[tau] * self.c_phip[tau]
        self.eh_ot_cphip = be.zeros(net_sze)

        # error buffers
        self.deltas = be.zeros((self.nout, self.batch_size))
        self.celtas = be.zeros((self.nout, self.batch_size))

        # temp buffer for numerical gradient
        self.temp_t = 0
Example #13
 def load_data(self, shape):
     data = np.random.uniform(low=0.0, high=1.0, size=shape)
     labels = np.random.randint(low=0, high=self.nout, size=shape[0])
     onehot = np.zeros((len(labels), self.nout), dtype='float32')
     for col in range(self.nout):
         onehot[:, col] = (labels == col)
     return (data, onehot)
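The one-hot construction on its own, for a handful of labels (a minimal sketch):

import numpy as np

labels = np.array([0, 2, 1, 2])
nout = 3
onehot = np.zeros((len(labels), nout), dtype='float32')
for col in range(nout):
    onehot[:, col] = (labels == col)
# onehot -> [[1, 0, 0],
#            [0, 0, 1],
#            [0, 1, 0],
#            [0, 0, 1]]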
Example #14
    def write_batches(self, offset, labels, imfiles):
        npts = -(-len(imfiles) // self.macro_size)
        starts = [i * self.macro_size for i in range(npts)]
        imfiles = [imfiles[s:s + self.macro_size] for s in starts]
        labels = [{k: v[s:s + self.macro_size]
                   for k, v in labels.iteritems()} for s in starts]

        for i, jpeg_file_batch in enumerate(imfiles):
            bfile = os.path.join(self.out_dir,
                                 '%s%d.cpio' % (self.batch_prefix, offset + i))
            label_batch = labels[i]['l_id']
            if os.path.exists(bfile):
                print("File %s exists, skipping..." % (bfile))
            else:
                self.write_individual_batch(bfile, label_batch,
                                            jpeg_file_batch)
                print("Wrote batch %d" % (i))

            # Check the batchfile for the max item value
            batch_max_item = self.writerlib.read_max_item(ct.c_char_p(bfile))
            if batch_max_item == 0:
                raise ValueError("Batch file %s probably empty or corrupt" %
                                 (bfile))

            self.item_max_size = max(batch_max_item, self.item_max_size)
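The -(-len(imfiles) // self.macro_size) expression is ceiling division (negate, floor-divide, negate); a quick illustration:

npts = -(-10 // 4)       # 10 files with macro_size 4 -> 3 macro batches
assert npts == 3
assert -(-8 // 4) == 2   # exact multiple: no extra batch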
Example #15
 def load_data(self, shape):
     data = np.random.uniform(low=0.0, high=1.0, size=shape)
     labels = np.random.randint(low=0, high=self.nout, size=shape[0])
     onehot = np.zeros((len(labels), self.nout), dtype='float32')
     for col in range(self.nout):
         onehot[:, col] = (labels == col)
     return (data, onehot)
Example #16
    def predict_fullset(self, dataset, setname):
        """
        Generate predictions and true labels for the given dataset.
        Note that this requires enough memory to house the predictions and
        labels for the entire dataset at one time (not recommended for large
        datasets, see predict_generator instead).

        Arguments:
            dataset: A neon dataset instance
            setname: Which set to compute predictions for (test, train, val)

        Returns:
            tuple: a 2-tuple of outputs and references.
                   The first item is the model probabilities for each class,
                   and the second item is either the one-hot or raw labels with
                   ground truth.

        See Also:
            predict_generator
        """
        self.data_layer.init_dataset(dataset)
        assert self.data_layer.has_set(setname)
        self.data_layer.use_set(setname, predict=True)
        self.data_layer.reset_counter()

        predlabels = self.backend.empty((1, self.batch_size))
        labels = self.backend.empty((1, self.batch_size))

        outputs_pred = self.backend.zeros((self.data_layer.num_batches *
                                           self.unrolls, self.batch_size))
        outputs_targ = self.backend.zeros((self.data_layer.num_batches *
                                           self.unrolls, self.batch_size))

        mb_id = 0
        self.data_layer.reset_counter()
        while self.data_layer.has_more_data():
            mb_id += 1
            self.reset(mb_id)
            self.fprop(debug=False)
            # time unrolling loop to disseminate fprop results
            for tau in range(self.unrolls):
                probs = self.class_layer.output_list[tau]
                targets = self.data_layer.targets[tau]
                self.backend.argmax(targets, axis=0, out=labels)
                self.backend.argmax(probs, axis=0, out=predlabels)

                # collect batches to re-assemble continuous data
                idx = self.unrolls * (mb_id - 1) + tau
                outputs_pred[idx, :] = predlabels
                outputs_targ[idx, :] = labels

        self.data_layer.cleanup()

        # flatten the 2d predictions into our canonical 1D format
        pred_flat = outputs_pred.transpose().reshape((1, -1))
        targ_flat = outputs_targ.transpose().reshape((1, -1))

        self.write_string(pred_flat, targ_flat, setname)

        return (pred_flat, targ_flat)
Example #17
    def extract_images(self, overwrite=False):
        from neon.data import load_cifar10
        from PIL import Image
        dataset = dict()
        dataset['train'], dataset['val'], _ = load_cifar10(self.out_dir, normalize=False)

        for setn in ('train', 'val'):
            data, labels = dataset[setn]

            img_dir = os.path.join(self.out_dir, setn)
            ulabels = np.unique(labels)
            for ulabel in ulabels:
                subdir = os.path.join(img_dir, str(ulabel))
                if not os.path.exists(subdir):
                    os.makedirs(subdir)

            for idx in range(data.shape[0]):
                im = np.pad(data[idx].reshape((3, 32, 32)), self.pad_width, mode='mean')
                im = np.uint8(np.transpose(im, axes=[1, 2, 0]).copy())
                im = Image.fromarray(im)
                path = os.path.join(img_dir, str(labels[idx][0]), str(idx) + '.png')
                im.save(path, format='PNG')

            if setn == 'train':
                self.pixel_mean = list(data.mean(axis=0).reshape(3, -1).mean(axis=1))
                self.pixel_mean.reverse()  # We will see this in BGR order b/c of opencv
Example #18
    def link_local(self):
        req_param(self, ['nifm', 'ifmshape', 'fshape'])

        opt_param(self, ['ofmlocs', 'links'])
        opt_param(self, ['deltasbuf', 'outputbuf'])

        opt_param(self, ['nofm'], self.nifm)
        opt_param(self, ['pooling'], False)
        opt_param(self, ['stride'], 1)
        opt_param(self, ['pad'], 0)

        assert len(self.ifmshape) == len(self.fshape)
        ofmshape = []
        for dim in range(len(self.ifmshape)):
            assert self.ifmshape[dim] >= self.fshape[dim]
            num = self.ifmshape[dim] - self.fshape[dim] + 2 * self.pad
            ofmshape.extend([num // self.stride + 1])
        self.ofmshape = tuple(ofmshape)
        self.negpad = -self.pad
        self.ifmsize = np.prod(self.ifmshape)
        self.ofmsize = np.prod(self.ofmshape)
        self.fpsize = np.prod(self.fshape)
        self.fsize = self.nifm * self.fpsize
        self.nout = self.nofm * self.ofmsize
        logger.debug('name=%s, nifm=%d, ifmshape=%s, ofmshape=%s',
                     self.name, self.nifm, self.ifmshape, self.ofmshape)
Example #19
    def write_csv_files(self):
        # Get the labels as the subdirs
        subdirs = glob(os.path.join(self.image_dir, '*'))
        self.label_names = sorted([os.path.basename(x) for x in subdirs])

        indexes = range(len(self.label_names))
        self.label_dict = {k: v for k, v in zip(self.label_names, indexes)}

        tlines = []
        vlines = []
        for subdir in subdirs:
            subdir_label = self.label_dict[os.path.basename(subdir)]
            files = glob(os.path.join(subdir, self.file_pattern))
            if self.class_samples_max is not None:
                files = files[:self.class_samples_max]
            lines = [(filename, subdir_label) for filename in files]
            v_idx = int(self.validation_pct * len(lines))
            tlines += lines[v_idx:]
            vlines += lines[:v_idx]
        np.random.shuffle(tlines)

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

        for ff, ll in zip([self.train_file, self.val_file], [tlines, vlines]):
            with gzip.open(ff, 'wb') as f:
                f.write('filename,l_id\n')
                for tup in ll:
                    f.write('{},{}\n'.format(*tup))

        self.train_nrec = len(tlines)
        self.train_start = 0

        self.val_nrec = len(vlines)
        self.val_start = -(-self.train_nrec // self.macro_size)
Example #20
    def predict_generator(self, dataset, setname):
        """
        Generate flattened predictions and true labels for the given dataset,
        one mini-batch at a time.

        Arguments:
            dataset: A neon dataset instance
            setname: Which set to compute predictions for (test, train, val)

        Returns:
            tuple: on each call will yield a 2-tuple of outputs and references.
                   The first item is the model probabilities for each class,
                   and the second item is either the one-hot or raw labels with
                   ground truth.

        See Also:
            predict_fullset
        """
        # TODO: find some alternate way of re-assembling data that doesn't
        # require allocating space for the entire dataset so we can avoid the
        # call to predict_fullset
        (pred_flat, targ_flat) = self.predict_fullset(dataset, setname)

        for i in range(self.data_layer.num_batches):
            start = i * self.unrolls * self.batch_size
            end = start + (self.unrolls * self.batch_size)
            yield (pred_flat[start:end], targ_flat[start:end])
Example #21
    def predict_fullset(self, dataset, setname):
        """
        Generate predictions and true labels for the given dataset.
        Note that this requires enough memory to house the predictions and
        labels for the entire dataset at one time (not recommended for large
        datasets, see predict_generator instead).

        Arguments:
            dataset: A neon dataset instance
            setname: Which set to compute predictions for (test, train, val)

        Returns:
            tuple: a 2-tuple of outputs and references.
                   The first item is the model probabilities for each class,
                   and the second item is either the one-hot or raw labels with
                   ground truth.

        See Also:
            predict_generator
        """
        self.data_layer.init_dataset(dataset)
        assert self.data_layer.has_set(setname)
        self.data_layer.use_set(setname, predict=True)
        self.data_layer.reset_counter()

        predlabels = self.backend.empty((1, self.batch_size))
        labels = self.backend.empty((1, self.batch_size))

        outputs_pred = self.backend.zeros((self.data_layer.num_batches *
                                           self.unrolls, self.batch_size))
        outputs_targ = self.backend.zeros((self.data_layer.num_batches *
                                           self.unrolls, self.batch_size))

        mb_id = 0
        self.data_layer.reset_counter()
        while self.data_layer.has_more_data():
            mb_id += 1
            self.reset(mb_id)
            self.fprop(debug=False)
            # time unrolling loop to disseminate fprop results
            for tau in range(self.unrolls):
                probs = self.class_layer.output_list[tau]
                targets = self.data_layer.targets[tau]
                self.backend.argmax(targets, axis=0, out=labels)
                self.backend.argmax(probs, axis=0, out=predlabels)

                # collect batches to re-assemble continuous data
                idx = self.unrolls * (mb_id - 1) + tau
                outputs_pred[idx, :] = predlabels
                outputs_targ[idx, :] = labels

        self.data_layer.cleanup()

        # flatten the 2d predictions into our canonical 1D format
        pred_flat = outputs_pred.transpose().reshape((1, -1))
        targ_flat = outputs_targ.transpose().reshape((1, -1))

        self.write_string(pred_flat, targ_flat, setname)

        return (pred_flat, targ_flat)
Example #22
    def write_csv_files(self):
        files = glob(os.path.join(self.image_dir, "*.jpg"))
        files.sort()
        if self.val_frac != 1.0:
            filemap, idmap, x1map, y1map, x2map, y2map = read_labels(
                self.image_dir, self.points1_file, self.points2_file, self.target_size
            )
        if self.id_label == 1:
            self.label_names = ["id"]
        else:
            self.label_names = ["x1", "y1", "x2", "y2"]

        indexes = range(len(self.label_names))
        self.label_dict = {k: v for k, v in zip(self.label_names, indexes)}

        tlines = []
        vlines = []

        np.random.shuffle(files)
        v_idx = int(self.val_frac * len(files))
        tfiles = files[v_idx:]
        vfiles = files[:v_idx]
        vfiles.sort()
        if self.id_label == 1:
            if self.val_frac == 1.0:
                vlines = [(f, 0) for f in vfiles]
            else:
                tlines = [(f, idmap[filemap[f]]) for f in tfiles]
        else:
            if self.val_frac == 1.0:
                vlines = [(f, 0, 0, 0, 0) for f in vfiles]
            else:
                tlines = [(f, x1map[f], y1map[f], x2map[f], y2map[f]) for f in tfiles]
        np.random.shuffle(tlines)

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

        for ff, ll in zip([self.train_file, self.val_file], [tlines, vlines]):
            with open(ff, "wb") as f:
                if self.id_label == 1:
                    f.write("filename,id\n")
                    for tup in ll:
                        f.write("{},{}\n".format(*tup))
                else:
                    f.write("filename,x,y\n")
                    for tup in ll:
                        f.write("{},{},{},{},{}\n".format(*tup))

        self.train_nrec = len(tlines)
        self.ntrain = -(-self.train_nrec // self.macro_size)
        self.train_start = 0

        self.val_nrec = len(vlines)
        self.nval = -(-self.val_nrec // self.macro_size)
        if self.ntrain == 0:
            self.val_start = 100
        else:
            self.val_start = 10 ** int(np.log10(self.ntrain * 10))
Example #23
    def allocate_output_bufs(self):
        make_zbuf = self.backend.zeros
        opt_param(self, ['out_shape'], (self.nout, self.batch_size))
        self.output = make_zbuf(self.out_shape, self.output_dtype)

        self.pre_act = self.activation.pre_act_buffer(self.backend,
                                                      self.output,
                                                      self.pre_act_dtype)

        # TODO: Get rid of output and pre_act. But they seem to be used in the
        # cost to set a buffer size.
        self.pre_act_list = [self.pre_act] + \
                            [make_zbuf(self.out_shape, self.pre_act_dtype)
                             for k in range(1, self.unrolls)]
        self.output_list = [self.output] + \
                           [make_zbuf(self.out_shape, self.output_dtype)
                            for k in range(1, self.unrolls)]
Example #24
    def write_batches(self, name, start, labels, imfiles, targets=None,
                      is_tar=False):
        pool = Pool(processes=self.num_workers)
        psz = self.batch_size
        osz = self.output_image_size
        npts = (len(imfiles) + psz - 1) // psz

        imfiles = [imfiles[i*psz: (i+1)*psz] for i in range(npts)]

        if targets is not None:
            targets = [targets[i*psz: (i+1)*psz].T.copy() for i in range(npts)]

        labels = [{k: v[i*psz: (i+1)*psz] for k, v in labels.iteritems()}
                  for i in range(npts)]

        accum_buf = np.zeros(self.train_mean.shape, dtype=np.int32)
        batch_mean = np.zeros(accum_buf.shape, dtype=np.uint8)
        logger.info("Writing %s batches...", name)
        for i, jpeg_file_batch in enumerate(imfiles):
            t = time()
            if is_tar:
                jpeg_file_batch = [j.read() for j in jpeg_file_batch]
            jpeg_strings = pool.map(
                functools.partial(proc_img, is_string=is_tar), jpeg_file_batch)
            targets_batch = None if targets is None else targets[i]
            labels_batch = labels[i]
            bfile = os.path.join(self.out_dir, 'data_batch_%d' % (start + i))
            serialize({'data': jpeg_strings,
                       'labels': labels_batch,
                       'targets': targets_batch},
                      bfile)
            logger.info("Wrote to %s (%s batch %d of %d) (%.2f sec)",
                        self.out_dir, name, i + 1, len(imfiles), time() - t)

            # get the means and accumulate
            imgworker.calc_batch_mean(jpglist=jpeg_strings, tgt=batch_mean,
                                      orig_size=osz, rgb=self.rgb,
                                      nthreads=self.num_workers)

            # scale for the case where we have an undersized batch
            if len(jpeg_strings) < self.batch_size:
                batch_mean *= len(jpeg_strings) / self.batch_size
            accum_buf += batch_mean
        pool.close()
        mean_buf = self.train_mean if name == 'train' else self.val_mean
        mean_buf[:] = accum_buf / len(imfiles)
Example #25
    def bleu_score(self, sents, targets):
        """
        Compute the BLEU score from a list of predicted sentences and reference sentences

        Args:
            sents (list): list of predicted sentences
            targets (list): list of reference sentences where each element is a list of
                            multiple references.
        """

        num_ref = len(targets[0])
        output_file = self.path + '/output'
        reference_files = [
            self.path + '/reference%d' % i for i in range(num_ref)
        ]
        bleu_script_url = 'https://raw.githubusercontent.com/karpathy/neuraltalk/master/eval/'
        bleu_script = 'multi-bleu.perl'

        neon_logger.display("Writing output and reference sents to dir %s" %
                            self.path)

        output_f = open(output_file, 'w+')
        for sent in sents:
            sent = sent.strip(self.end_token).split()
            output_f.write(" ".join(sent) + '\n')

        reference_f = [open(f, 'w') for f in reference_files]
        for i in range(num_ref):
            for target_sents in targets:
                reference_f[i].write(target_sents[i] + '\n')

        output_f.close()
        [x.close() for x in reference_f]

        owd = os.getcwd()
        os.chdir(self.path)
        if not os.path.exists(bleu_script):
            Dataset.fetch_dataset(bleu_script_url, bleu_script, bleu_script,
                                  6e6)
        bleu_command = 'perl multi-bleu.perl reference < output'
        neon_logger.display(
            "Executing bleu eval script: {}".format(bleu_command))
        os.system(bleu_command)
        os.chdir(owd)
Example #26
 def transpose_batches(self, data, dtype, is_target=False):
     """
     Transpose each minibatch within the dataset.
     """
     bs = self.data_dim * self.unrolls
     dd = self.data_dim
     if data.shape[0] % bs != 0:
         logger.warning('Incompatible batch size. '
                        'Discarding %d samples...',
                        data.shape[0] % bs)
      nbatches = data.shape[0] // bs
     batchwise = [[] for k in range(nbatches)]
     for batch in range(nbatches):
         batchdata = [self.backend.array(data[(batch * bs + k * dd):
                                              (batch * bs + (k + 1) *
                                               dd)], dtype)
                      for k in range(self.unrolls)]
         batchwise[batch] = batchdata
     return batchwise
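The slicing above carves the stream into minibatches of data_dim * unrolls rows and then into unrolls chunks of data_dim rows each; a NumPy-only sketch of the same indexing, with made-up sizes and plain array slices in place of backend.array:

import numpy as np

data_dim, unrolls, batch_cols = 4, 3, 2
bs = data_dim * unrolls                       # rows consumed per minibatch
data = np.arange(2 * bs * batch_cols).reshape(2 * bs, batch_cols)   # two full minibatches
nbatches = data.shape[0] // bs
batchwise = [[data[b * bs + k * data_dim:b * bs + (k + 1) * data_dim]
              for k in range(unrolls)]
             for b in range(nbatches)]
# batchwise[1][2] is the last data_dim-row chunk of the second minibatch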
Example #27
 def ellipse(self, canvas, xrad, yrad):
     rcanvas = canvas.reshape((self.nifm, self.ifmheight, self.ifmwidth))
     smooth = 10
     angs = np.linspace(0, 2 * np.pi, smooth * 360)
     si = np.sin(angs)
     co = np.cos(angs)
     xvals = np.int32(xrad * co) + self.center[0]
     yvals = np.int32(yrad * si) + self.center[1]
     for fm in range(self.nifm):
         rcanvas[fm, xvals, yvals] = np.random.randint(256)
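The ellipse is rasterized by sampling the parametric form (xrad*cos(theta) + cx, yrad*sin(theta) + cy) at many angles and writing those pixels. A standalone single-channel sketch; canvas size, center, and radii are made up:

import numpy as np

height, width, center = 64, 64, (32, 32)
canvas = np.zeros((height, width), dtype=np.uint8)
angs = np.linspace(0, 2 * np.pi, 10 * 360)
xvals = np.int32(20 * np.cos(angs)) + center[0]   # row indices
yvals = np.int32(12 * np.sin(angs)) + center[1]   # column indices
canvas[xvals, yvals] = 255                        # draw the outline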
Example #28
    def allocate_output_bufs(self):
        make_zbuf = self.backend.zeros
        super(RecurrentHiddenLayer, self).allocate_output_bufs()

        # these buffers are specific to RHL:
        # might want self.temp_in=temp_out, to save a buffer.
        self.temp_in = make_zbuf(self.weight_shape, self.weight_dtype)
        self.temp_rec = make_zbuf(self.weight_rec_shape)
        # Extra temp buffers z[0]=w*x and z[1]=w*input.
        self.z = [make_zbuf(self.out_shape) for k in range(2)]
Example #29
 def ellipse(self, canvas, xrad, yrad):
     rcanvas = canvas.reshape((self.nifm, self.ifmheight, self.ifmwidth))
     smooth = 10
     angs = np.linspace(0, 2 * np.pi, smooth * 360)
     si = np.sin(angs)
     co = np.cos(angs)
     xvals = np.int32(xrad * co) + self.center[0]
     yvals = np.int32(yrad * si) + self.center[1]
     for fm in range(self.nifm):
         rcanvas[fm, xvals, yvals] = np.random.randint(256)
Example #30
    def allocate_output_bufs(self):
        make_zbuf = self.backend.zeros
        super(RecurrentHiddenLayer, self).allocate_output_bufs()

        # these buffers are specific to RHL:
        # might want self.temp_in=temp_out, to save a buffer.
        self.temp_in = make_zbuf(self.weight_shape, self.weight_dtype)
        self.temp_rec = make_zbuf(self.weight_rec_shape)
        # Extra temp buffers z[0]=w*x and z[1]=w*input.
        self.z = [make_zbuf(self.out_shape) for k in range(2)]
Example #31
    def allocate_output_bufs(self):
        """ all the activations and temp buffers live here """
        super(RecurrentLSTMLayer, self).allocate_output_bufs()

        # things that are not initialized by the super class
        be = self.backend
        net_sze = (self.nout, self.batch_size)  # tuple with activation size.

        # buffers for gate activation
        for a in ['i', 'f', 'o', 'g']:
            setattr(self, a + '_t',
                    [be.zeros(net_sze) for k in range(self.unrolls)])
            setattr(self, 'net_' + a,
                    [be.zeros(net_sze) for k in range(self.unrolls)])

        # outputs: pre-allocate for d{i,f,o,c}_dh1
        self.d_dh1 = {gateid: be.zeros(net_sze) for
                      gateid in ['i', 'f', 'o', 'c']}
        self.dc_d_dh1 = {gateid: be.zeros(net_sze) for
                         gateid in ['i', 'f', 'c']}
        self.errs = {hcval: be.zeros(net_sze) for
                     hcval in ['hh', 'hc', 'ch', 'cc']}
        self.gatedic = {}
        self.gatedic_u = {}

        # buffers for cell and output
        self.c_t = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.c_phi = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.c_phip = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.output_list = [be.zeros(net_sze) for k in range(self.unrolls)]

        # pre-allocate preactivation buffers
        self.temp_x = [be.zeros(net_sze) for k in range(self.unrolls)]
        self.temp_h = [be.zeros(net_sze) for k in range(self.unrolls)]

        # pre-allocate derivative buffers
        self.dh_dwx_buf = be.zeros((self.nout, self.nin))
        self.dh_dwh_buf = be.zeros((self.nout, self.nout))

        self.delta_buf = be.zeros(net_sze)
        self.bsum_buf = be.zeros((self.nout, 1))

        # This quantity seems to be computed repeatedly
        # error_h * self.o_t[tau] * self.c_phip[tau]
        self.eh_ot_cphip = be.zeros(net_sze)

        # error buffers
        self.deltas = be.zeros((self.nout, self.batch_size))
        self.celtas = be.zeros((self.nout, self.batch_size))

        # temp buffer for numerical gradient
        self.temp_t = 0
Example #32
    def fprop(self,
              debug=False,
              eps_tau=-1,
              eps=0,
              num_target=None,
              num_i=0,
              num_j=0):
        """
        Adding numerical gradient functionality here to avoid duplicate fprops.
        TODO: Make a version where the for tau loop is inside the layer. The
        best way is to have a baseclass for both RNN and LSTM for this.
        """
        self.data_layer.fprop(None)  # get next mini batch
        inputs = self.data_layer.output
        y = self.rec_layer.output_list  # note: just a shorthand, no copy.
        c = [None for k in range(len(y))]
        if 'c_t' in self.rec_layer.__dict__:
            c = self.rec_layer.c_t

        # loop for rec_layer
        for tau in range(0, self.unrolls):
            if tau == eps_tau:
                numpy_target = num_target[num_i, num_j].asnumpyarray()
                num_target[num_i, num_j] = (numpy_target + eps)
            if debug:
                logger.debug("in RNNB.fprop, tau %d, input %d" %
                             (tau, inputs[tau].asnumpyarray().argmax(0)[0]))
            self.rec_layer.fprop(y[tau - 1], c[tau - 1], inputs[tau], tau)
            if tau == eps_tau:
                num_target[num_i, num_j] = numpy_target

        # loop for class_layer
        for tau in range(0, self.unrolls):
            if tau == eps_tau:
                numpy_target = num_target[num_i, num_j].asnumpyarray()
                num_target[num_i, num_j] = (numpy_target + eps)
            if debug:
                logger.debug("in RNNB.fprop, tau %d, input %d" %
                             (tau, inputs[tau].asnumpyarray().argmax(0)[0]))
            self.class_layer.fprop(y[tau], tau)
            if tau == eps_tau:
                num_target[num_i, num_j] = numpy_target
Example #33
 def transpose_batches(self, data, dtype, is_target=False):
     """
     Transpose each minibatch within the dataset.
     """
     bs = self.data_dim * self.unrolls
     dd = self.data_dim
     if data.shape[0] % bs != 0:
         logger.warning(
             'Incompatible batch size. '
             'Discarding %d samples...', data.shape[0] % bs)
      nbatches = data.shape[0] // bs
     batchwise = [[] for k in range(nbatches)]
     for batch in range(nbatches):
         batchdata = [
             self.backend.array(
                 data[(batch * bs + k * dd):(batch * bs + (k + 1) * dd)],
                 dtype) for k in range(self.unrolls)
         ]
         batchwise[batch] = batchdata
     return batchwise
Example #34
    def initialize(self, kwargs):
        super(BranchLayer, self).initialize(kwargs)

        self.startidx = [0] * len(self.sublayers)
        self.endidx = [0] * len(self.sublayers)
        self.endidx[0] = self.sublayers[0].nout
        for i in range(1, len(self.sublayers)):
            self.endidx[i] = self.endidx[i - 1] + self.sublayers[i].nout
            self.startidx[i] = self.endidx[i - 1]

        self.allocate_output_bufs()
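The start/end indices are a running sum of the sublayer output sizes; a small sketch with hypothetical nout values:

nouts = [3, 5, 2]                 # hypothetical sublayer output sizes
startidx, endidx = [0] * len(nouts), [0] * len(nouts)
endidx[0] = nouts[0]
for i in range(1, len(nouts)):
    endidx[i] = endidx[i - 1] + nouts[i]
    startidx[i] = endidx[i - 1]
# startidx -> [0, 3, 8], endidx -> [3, 8, 10]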
Example #35
    def initialize(self, kwargs):
        super(BranchLayer, self).initialize(kwargs)

        self.startidx = [0] * len(self.sublayers)
        self.endidx = [0] * len(self.sublayers)
        self.endidx[0] = self.sublayers[0].nout
        for i in range(1, len(self.sublayers)):
            self.endidx[i] = self.endidx[i - 1] + self.sublayers[i].nout
            self.startidx[i] = self.endidx[i - 1]

        self.allocate_output_bufs()
Example #36
    def fprop(self, debug=False, eps_tau=-1, eps=0,
              num=None):
        """
        Adding numerical gradient functionality here to avoid duplicate fprops.
        TODO: Make a version where the for tau loop is inside the layer. The
        best way is to have a baseclass for both RNN and LSTM for this.
        """
        self.data_layer.fprop(None)  # get next mini batch
        inputs = self.data_layer.output
        y = self.rec_layer.output_list  # note: just a shorthand, no copy.
        c = [None for k in range(len(y))]
        if 'c_t' in self.rec_layer.__dict__:
            c = self.rec_layer.c_t

        # loop for rec_layer
        for tau in range(0, self.unrolls):
            if num and num['target'] and (tau == eps_tau):
                # inject epsilon for numerical gradient
                numpy_target = num['target'][num['i'], num['j']].asnumpyarray()
                num['target'][num['i'], num['j']] = (numpy_target + eps)
            if debug:
                logger.debug("in RNNB.fprop, tau %d, input %s" % (tau,
                             inputs[tau].asnumpyarray().argmax(0)[0:5]))
            self.rec_layer.fprop(y[tau-1], c[tau-1], inputs[tau], tau)
            if num and num['target'] and (tau == eps_tau):
                # remove epsilon
                num['target'][num['i'], num['j']] = numpy_target

        # loop for class_layer
        for tau in range(0, self.unrolls):
            if num and num['target'] and (tau == eps_tau):
                # inject epsilon for numerical gradient
                numpy_target = num['target'][num['i'], num['j']].asnumpyarray()
                num['target'][num['i'], num['j']] = (numpy_target + eps)
            if debug:
                logger.debug("in RNNB.fprop, tau %d, input %s" % (tau,
                             inputs[tau].asnumpyarray().argmax(0)[0:5]))
            self.class_layer.fprop(y[tau], tau)
            if num and num['target'] and (tau == eps_tau):
                # remove epsilon
                num['target'][num['i'], num['j']] = numpy_target
Example #37
 def load(self, backend=None, experiment=None):
     """
      Download the raw image data if needed and load the training set.
     """
     import scipy.io
     if 'repo_path' in self.__dict__:
         self.repo_path = os.path.expandvars(
             os.path.expanduser(self.repo_path))
         save_dir = os.path.join(self.repo_path, self.__class__.__name__)
         if not os.path.exists(save_dir):
             os.makedirs(save_dir)
         train_idcs = list(range(10000))
         if 'sample_pct' in self.__dict__:
             if self.sample_pct > 1.0:
                 self.sample_pct /= 100.0
             if self.sample_pct < 1.0:
                 numpy.random.seed(self.backend.rng_seed)
                 numpy.random.shuffle(train_idcs)
             train_idcs = train_idcs[0:int(10000 * self.sample_pct)]
         for url in (self.raw_train_unwhitened, self.raw_train_whitened):
             name = os.path.basename(url).rstrip('.mat')
             repo_mat_file = os.path.join(save_dir, name + '.mat')
             repo_file = repo_mat_file.rstrip('.mat')
             # download and create dataset
             if not os.path.exists(repo_file):
                 self.download_to_repo(url, save_dir)
                 infile = scipy.io.loadmat(repo_mat_file)
                 with open(repo_file, 'wb') as outfile:
                     data = infile[infile.keys()[0]]
                     # patches are extracted so they can be cached
                     # doing non-overlapping 16x16 patches (1024 per image)
                     patches = data.reshape(512 / 16, 16, 512 / 16, 16, 10)
                     patches = patches.transpose(1, 3, 0, 2, 4)
                     patches = patches.reshape(16, 16, 1024 * 10)
                     logger.info("Caching to pickle file: %s", outfile)
                     pickle.dump(patches, outfile)
                     outfile.close()
             logger.info('loading: %s', name)
             # load existing data
             if 'IMAGES' in repo_file:
                 indat = self.read_image_file(repo_file, 'float32')
                 # flatten to 1D images
                 indat = indat.reshape((256, 10240)).transpose()[train_idcs]
                 self.inputs['train'] = indat
             else:
                 logger.error('problems loading: %s', name)
         if hasattr(self, 'validation_pct'):
             self.split_set(self.validation_pct,
                            from_set='train',
                            to_set='validation')
         self.format()
     else:
         raise AttributeError('repo_path not specified in config')
Example #38
    def load_file(self, filename, nclasses):
        logger.info("loading: %s", filename)
        dict = deserialize(filename)

        full_image = np.float32(dict["data"])
        full_image /= 255.0

        labels = np.array(dict["labels"])
        onehot = np.zeros((len(labels), nclasses), dtype="float32")
        for col in range(nclasses):
            onehot[:, col] = labels == col
        return (full_image, onehot)
Example #39
    def load_file(self, filename, nclasses):
        logger.info('loading: %s', filename)
        dict = deserialize(filename)

        full_image = np.float32(dict['data'])
        full_image /= 255.

        labels = np.array(dict['labels'])
        onehot = np.zeros((len(labels), nclasses), dtype='float32')
        for col in range(nclasses):
            onehot[:, col] = (labels == col)
        return (full_image, onehot)
Example #40
 def load(self, backend=None, experiment=None):
     """
      Download the raw image data if needed and load the training set.
     """
     import scipy.io
     if 'repo_path' in self.__dict__:
         self.repo_path = os.path.expandvars(os.path.expanduser(
                                             self.repo_path))
         save_dir = os.path.join(self.repo_path,
                                 self.__class__.__name__)
         if not os.path.exists(save_dir):
             os.makedirs(save_dir)
         train_idcs = list(range(10000))
         if 'sample_pct' in self.__dict__:
             if self.sample_pct > 1.0:
                 self.sample_pct /= 100.0
             if self.sample_pct < 1.0:
                 numpy.random.seed(self.backend.rng_seed)
                 numpy.random.shuffle(train_idcs)
             train_idcs = train_idcs[0:int(10000 * self.sample_pct)]
         for url in (self.raw_train_unwhitened, self.raw_train_whitened):
             name = os.path.basename(url).rstrip('.mat')
             repo_mat_file = os.path.join(save_dir, name + '.mat')
             repo_file = repo_mat_file.rstrip('.mat')
             # download and create dataset
             if not os.path.exists(repo_file):
                 self.download_to_repo(url, save_dir)
                 infile = scipy.io.loadmat(repo_mat_file)
                 with open(repo_file, 'wb') as outfile:
                     data = infile[infile.keys()[0]]
                     # patches are extracted so they can be cached
                     # doing non-overlapping 16x16 patches (1024 per image)
                     patches = data.reshape(512 / 16, 16, 512 / 16, 16, 10)
                     patches = patches.transpose(1, 3, 0, 2, 4)
                     patches = patches.reshape(16, 16, 1024 * 10)
                     logger.info("Caching to pickle file: %s", outfile)
                     pickle.dump(patches, outfile)
                     outfile.close()
             logger.info('loading: %s', name)
             # load existing data
             if 'IMAGES' in repo_file:
                 indat = self.read_image_file(repo_file, 'float32')
                 # flatten to 1D images
                 indat = indat.reshape((256, 10240)).transpose()[train_idcs]
                 self.inputs['train'] = indat
             else:
                 logger.error('problems loading: %s', name)
         if hasattr(self, 'validation_pct'):
             self.split_set(
                 self.validation_pct, from_set='train', to_set='validation')
         self.format()
     else:
         raise AttributeError('repo_path not specified in config')
Example #41
    def load_file(self, filename, nclasses):
        logger.info('loading: %s', filename)
        dict = deserialize(filename)

        full_image = np.float32(dict['data'])
        full_image /= 255.

        labels = np.array(dict['labels'])
        onehot = np.zeros((len(labels), nclasses), dtype='float32')
        for col in range(nclasses):
            onehot[:, col] = (labels == col)
        return (full_image, onehot)
Exemple #42
0
 def make_links(self, nifm, ifmsize, ifmshape, ofmshape, fshape, stride):
     # Figure out local connections to the previous layer.
     # This function works for any number of dimensions.
     ndims = len(ifmshape)
     dimsizes = np.empty(ndims, dtype='int32')
     for dim in range(ndims):
         dimsizes[dim] = np.prod(ifmshape[dim:])
     links = []
     for ofmdim in np.ndindex(ofmshape):
         # This variable tracks the top left corner of
         # the receptive field.
         src = ofmdim[-1]
         for dim in range(-1, -ndims, -1):
             src += dimsizes[dim] * ofmdim[dim - 1]
         src *= stride
         indlist = list(range(src, src + fshape[-1]))
         for dim in range(-1, -ndims, -1):
             indarray = np.array(indlist)
             for dimind in range(1, fshape[dim - 1]):
                 indlist.extend(list(indarray + dimind * dimsizes[dim]))
         if self.pooling is False:
             indarray = np.array(indlist)
             for ifm in range(1, nifm):
                 indlist.extend(list(indarray + ifm * ifmsize))
         links.append(indlist)
     self.links = np.array(links, dtype='int32')
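For every output position, make_links collects the flat indices of that position's receptive field in the previous layer. A small standalone cross-check for a hypothetical 4x4 input with 2x2 filters and stride 2 (not the class method itself) shows what those index lists look like:

import numpy as np

# Hypothetical toy case: one 4x4 input feature map, 2x2 filters, stride 2.
ifmshape, fshape, stride = (4, 4), (2, 2), 2
rows, cols = np.meshgrid(np.arange(fshape[0]), np.arange(fshape[1]),
                         indexing='ij')

for out_r in range(0, ifmshape[0] - fshape[0] + 1, stride):
    for out_c in range(0, ifmshape[1] - fshape[1] + 1, stride):
        # Flat input indices of the receptive field for this output position,
        # i.e. one row of the links table.
        flat = ((out_r + rows) * ifmshape[1] + (out_c + cols)).ravel()
        print(flat)  # [0 1 4 5], [2 3 6 7], [8 9 12 13], [10 11 14 15]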
Exemple #43
0
def process_dataset(data, labels, inputpath, leafdir):
    datadir = os.path.join(inputpath, leafdir)
    print('Saving images to %s' % datadir)
    os.mkdir(datadir)
    ulabels = np.unique(labels)
    for ulabel in ulabels:
        os.mkdir(os.path.join(datadir, str(ulabel)))
    for idx in range(data.shape[0]):
        im = data[idx].reshape((3, 32, 32))
        im = np.uint8(np.transpose(im, axes=[1, 2, 0]).copy())
        im = Image.fromarray(im)
        path = os.path.join(datadir, str(labels[idx][0]), str(idx) + '.png')
        im.save(path, format='PNG')
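A hedged usage sketch for process_dataset with random stand-in data and an assumed scratch directory; real callers would pass the unpacked CIFAR-10 'data' and 'labels' arrays:

import os
import numpy as np

out_root = '/tmp/cifar_png'  # assumed scratch location for this sketch
if not os.path.exists(out_root):
    os.makedirs(out_root)

# Four random RGB 32x32 images with labels drawn from two classes.
fake_data = np.random.randint(0, 256, size=(4, 3 * 32 * 32)).astype(np.uint8)
fake_labels = np.array([[0], [1], [0], [1]])
process_dataset(fake_data, fake_labels, out_root, 'train')
# Writes /tmp/cifar_png/train/0/*.png and /tmp/cifar_png/train/1/*.png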
Exemple #44
0
    def transpose_batches(self, data, dtype, is_target=False):
        """
        Transpose and distribute each minibatch within a dataset.

        Arguments:
            data (ndarray): Dataset to be sliced into mini batches,
                            transposed, and loaded to appropriate device
                            memory.
            is_target (bool): If True, each minibatch is replicated across
                              devices instead of being fragmented when
                              running distributed. Defaults to False.
        Returns:
            list: List of device loaded mini-batches of data.
        """
        bs = self.backend.batch_size
        sba = self.backend.array
        if data.shape[0] % bs != 0:
            logger.warning('Incompatible batch size. Discarding %d samples...',
                           data.shape[0] % bs)
        nbatches = data.shape[0] // bs
        batchwise = []
        if not self.backend.is_dist:
            batchwise = [sba(data[idx * bs:(idx + 1) * bs].transpose().copy())
                         for idx in range(nbatches)]
        else:
            batchwise = []
            if is_target or not self.fragment_data:
                devshape = (bs, data.shape[1])
                ptype = 'replica'
            else:
                devshape = (bs // self.backend.num_dev, data.shape[1])
                ptype = 'fragment'
            dev_batchdata_t = self.backend.empty(devshape)
            dev_batchdata_t.ptype = ptype
            for batch in range(nbatches):
                self.backend.set(dev_batchdata_t,
                                 data[batch * bs:(batch + 1) * bs])
                dev_batchdata = self.backend.empty(dev_batchdata_t.shape[::-1])
                dev_batchdata[:] = dev_batchdata_t.T
                dev_batchdata.ptype = dev_batchdata_t.ptype
                batchwise.append(dev_batchdata)
        return batchwise
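In the non-distributed branch, each contiguous slice of bs samples is transposed so that features become rows and samples become columns. A plain-NumPy sketch of that branch with made-up data and no device backend:

import numpy as np

# Made-up dataset: 10 samples of 4 features, batch size 4.
data, bs = np.arange(40).reshape(10, 4).astype('float32'), 4
nbatches = data.shape[0] // bs  # 2 full batches; the trailing 2 samples are dropped
batchwise = [data[idx * bs:(idx + 1) * bs].T.copy() for idx in range(nbatches)]
print([b.shape for b in batchwise])  # [(4, 4), (4, 4)]: features x batch_size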
Exemple #45
0
    def read_txt_file(self, fname, dtype=None):
        """
        Carries out the actual reading
        """
        with open(fname, 'r') as f:
            text = f.read()
            numbers = numpy.fromstring(text, dtype='int8')
            assert(self.data_dim == 128), "one-hot encoded ASCII required"
            onehots = numpy.zeros((self.data_dim, numbers.shape[0]))
            for i in range(numbers.shape[0]):
                onehots[numbers[i], i] = 1

        array = onehots
        return array
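Each character's ASCII code selects the row of a 128-way one-hot column, so the whole file becomes a (data_dim, num_chars) matrix. A minimal standalone sketch of that encoding (not the class method):

import numpy as np

# Hypothetical 3-character text; the byte values are the ASCII codes.
codes = np.frombuffer(b'abc', dtype='int8')
onehots = np.zeros((128, codes.shape[0]))
onehots[codes, np.arange(codes.shape[0])] = 1
assert onehots[ord('a'), 0] == 1 and onehots[ord('c'), 2] == 1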
Exemple #46
0
    def write_batches(self, name, offset, labels, imfiles):
        pool = Pool(processes=self.num_workers)
        npts = -(-len(imfiles) // self.macro_size)
        starts = [i * self.macro_size for i in range(npts)]
        imfiles = [imfiles[s : s + self.macro_size] for s in starts]
        labels = [{k: v[s : s + self.macro_size] for k, v in labels.iteritems()} for s in starts]

        print("Writing %d %s batches..." % (len(imfiles), name))
        for i, jpeg_file_batch in enumerate(imfiles):
            proc_img_func = functools.partial(proc_img, self.target_size)
            jpeg_strings = pool.map(proc_img_func, jpeg_file_batch)
            bfile = os.path.join(self.out_dir, "%s%d" % (self.batch_prefix, offset + i))
            self.write_binary(jpeg_strings, labels[i], bfile)
        pool.close()
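The expression -(-len(imfiles) // self.macro_size) is ceiling division, so a trailing partial macrobatch is still written. A quick sketch of the idiom with hypothetical sizes:

# Ceiling division without math.ceil: 10 files at a macro size of 4
# yield 3 batches of 4, 4, and 2 files.
imfiles, macro_size = list(range(10)), 4
npts = -(-len(imfiles) // macro_size)           # == 3
starts = [i * macro_size for i in range(npts)]  # [0, 4, 8]
chunks = [imfiles[s:s + macro_size] for s in starts]
print([len(c) for c in chunks])                 # [4, 4, 2]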
Exemple #47
0
    def read_txt_file(self, fname, dtype=None):
        """
        Carries out the actual reading
        """
        with open(fname, 'r') as f:
            text = f.read()
            numbers = numpy.fromstring(text, dtype='int8')
            assert (self.data_dim == 128), "one-hot encoded ASCII required"
            onehots = numpy.zeros((self.data_dim, numbers.shape[0]))
            for i in range(numbers.shape[0]):
                onehots[numbers[i], i] = 1

        array = onehots
        return array
Exemple #48
0
    def load_data(self, shape):
        data = np.zeros(shape, dtype='float32')
        labels = np.zeros(shape[0], dtype='float32')
        ncircles = shape[0] // 2

        for row in range(0, ncircles):
            # Make circles.
            rad = np.random.randint(self.minrad, self.maxrad)
            self.circle(data[row], rad)

        for row in range(ncircles, shape[0]):
            # Make ellipses.
            while True:
                xrad, yrad = np.random.randint(self.minrad, self.maxrad, 2)
                if xrad != yrad:
                    break
            self.ellipse(data[row], xrad, yrad)
            labels[row] = 1

        data /= 255
        onehot = np.zeros((len(labels), self.nout), dtype='float32')
        for col in range(self.nout):
            onehot[:, col] = (labels == col)
        return (data, onehot)
Exemple #49
0
    def write_binary(self, jpegs, labels, ofname):
        num_imgs = len(jpegs)
        keylist = ['l_id']
        with open(ofname, 'wb') as f:
            f.write(struct.pack('I', num_imgs))
            f.write(struct.pack('I', len(keylist)))

            for key in keylist:
                ksz = len(key)
                f.write(struct.pack('L' + 'B' * ksz, ksz, *bytearray(key)))
                f.write(struct.pack('I' * num_imgs, *labels[key]))

            for i in range(num_imgs):
                jsz = len(jpegs[i])
                bin = struct.pack('I' + 'B' * jsz, jsz, *bytearray(jpegs[i]))
                f.write(bin)
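write_binary stores two 4-byte counts, then each key as a length-prefixed name followed by the per-image labels, then every JPEG as a length-prefixed byte string. The sketch below is a reader that mirrors that layout; it is an assumption for illustration, not a function provided by the library, and it relies on native struct sizes matching the writer's platform:

import struct

def read_binary(fname):
    # Mirror of the layout written above, assuming label values fit in 'I'.
    with open(fname, 'rb') as f:
        num_imgs = struct.unpack('I', f.read(struct.calcsize('I')))[0]
        num_keys = struct.unpack('I', f.read(struct.calcsize('I')))[0]
        labels = {}
        for _ in range(num_keys):
            ksz = struct.unpack('L', f.read(struct.calcsize('L')))[0]
            key = f.read(ksz).decode('ascii')
            labels[key] = struct.unpack('I' * num_imgs,
                                        f.read(struct.calcsize('I') * num_imgs))
        jpegs = []
        for _ in range(num_imgs):
            jsz = struct.unpack('I', f.read(struct.calcsize('I')))[0]
            jpegs.append(f.read(jsz))
    return jpegs, labels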
Exemple #50
0
    def write_binary(self, jpegs, labels, ofname):
        num_imgs = len(jpegs)
        keylist = ["l_id"]
        with open(ofname, "wb") as f:
            f.write(struct.pack("I", num_imgs))
            f.write(struct.pack("I", len(keylist)))

            for key in keylist:
                ksz = len(key)
                f.write(struct.pack("L" + "B" * ksz, ksz, *bytearray(key)))
                f.write(struct.pack("I" * num_imgs, *labels[key]))

            for i in range(num_imgs):
                jsz = len(jpegs[i])
                bin = struct.pack("I" + "B" * jsz, jsz, *bytearray(jpegs[i]))
                f.write(bin)
    def write_csv_files(self):
        # image_idx : split
        split_file = np.loadtxt(os.path.join(self.dataset_dir, 'train_test_val_split.txt'), delimiter=' ')
        # image_idx : file_name
        images_key = np.loadtxt(os.path.join(self.dataset_dir, 'images.txt'), delimiter=' ', dtype='str')
        # image_idx : class_idx
        image_class_labels = np.loadtxt(os.path.join(self.dataset_dir, 'image_class_labels.txt'), delimiter=' ') - 1
        # image_idx : class_name
        self.label_names = [x.strip()[x.index(' ')+1:] for x in open(os.path.join(self.dataset_dir, 'classes.txt'), 'r').readlines()]

        self.nclass = len(self.label_names)
        self.label_dict = dict(zip(self.label_names, range(self.nclass)))

        # Assign each image to its split (0=train, 1=test, 2=validation)
        tlines = []
        tslines = []
        vlines = []
        splits = {0: tlines, 1: tslines, 2: vlines}
        for i in xrange(split_file.shape[0]):
            if self.class_samples_max and i > self.class_samples_max:
                break
            full_filename = os.path.join(os.path.join(self.dataset_dir, 'images'), images_key[i, 1])
            splits[split_file[i, 1]].append((full_filename, image_class_labels[i, 1]))

        np.random.shuffle(tlines)

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

        for ff, ll in zip([self.train_file, self.test_file, self.val_file], [tlines, tslines, vlines]):
            with gzip.open(ff, 'wb') as f:
                f.write('filename,l_id\n')
                for tup in ll:
                    f.write('{},{}\n'.format(*tup))

        self.train_nrec = len(tlines)
        self.ntrain = -(-self.train_nrec // self.macro_size)
        self.train_start = 0

        self.test_nrec = len(tslines)
        self.ntest = -(-self.test_nrec // self.macro_size)
        self.test_start = self.train_start + self.ntrain + 1

        self.val_nrec = len(vlines)
        self.nval = -(-self.val_nrec // self.macro_size)
        self.val_start = self.test_start + self.ntest + 1
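Each manifest written above is a gzipped CSV whose first line is the 'filename,l_id' header. A small sketch of reading one back, with the path assumed purely for illustration:

import gzip

# Assumed path; in practice this would be self.train_file, self.test_file,
# or self.val_file as configured elsewhere in the class.
with gzip.open('/tmp/macrobatches/train_file.csv.gz', 'rb') as f:
    header = f.readline().strip()                    # b'filename,l_id'
    rows = [line.strip().split(b',') for line in f]  # [[filename, l_id], ...]
print(header, len(rows))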