Example #1
	def crop_pickle(self,pickle_name,func=1,bias=[[0,0]],nm=5):
		'''
		func=1: arraycrop
		func=2: randomcrop
		'''
		with open(pickle_name,'rb') as f:
			save = pickle.load(f)
			data = save['image']
			del save
		images = dict()
		if func == 1:
			self.set_boxes(bias)
			for name, value in data.iteritems():
				cropped_images = self.arraycrop(value)
				for key, cropped_image in enumerate(cropped_images):
					images.setdefault(str(key)+'-'+name,cropped_image)

		elif func == 2:
			for name, value in data.iteritems():
				cropped_images = self.randomcrop(value,nm)
				for key, cropped_image in enumerate(cropped_images):
					images.setdefault(str(key)+'-'+name,cropped_image)

		with open('new_'+pickle_name,'wb') as f:
			save={
			'image': images
			}
			pickle.dump(save,f,pickle.HIGHEST_PROTOCOL)
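A short sketch (an assumption, not from the original source; 'faces.pickle' is a hypothetical input name) of reading back the file this method writes:

import pickle

# crop_pickle('faces.pickle', ...) writes 'new_faces.pickle' with the layout
# {'image': {'<crop index>-<original name>': cropped_array, ...}}
with open('new_faces.pickle', 'rb') as f:
    save = pickle.load(f)
for name, cropped_image in save['image'].items():
    print(name, getattr(cropped_image, 'shape', None))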
Example #2
    def save_model(self, modelfile):
        with open(modelfile, "wb") as f:
            cPickle.dump(self.layers, f, protocol=cPickle.HIGHEST_PROTOCOL)

        with open("params_" + modelfile, "wb") as f:
            for layer_key in self.layers.keys():
                cPickle.dump(self.layers[layer_key].params, f, protocol=cPickle.HIGHEST_PROTOCOL)
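A hedged loading counterpart (not part of the original class). It assumes the layers mapping iterates its keys in the same order at load time as when the params were dumped, e.g. an OrderedDict:

import cPickle

def load_model(modelfile):
    # Reload the pickled layers, then restore each layer's params in the same
    # key order used by save_model above (assumes a stable key order such as
    # an OrderedDict; a plain dict may not preserve the dump order).
    with open(modelfile, "rb") as f:
        layers = cPickle.load(f)
    with open("params_" + modelfile, "rb") as f:
        for layer_key in layers.keys():
            layers[layer_key].params = cPickle.load(f)
    return layers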
Example #3
def test_simple():
    fig = plt.figure()
    # un-comment to debug
#    recursive_pickle(fig)
    pickle.dump(fig, BytesIO(), pickle.HIGHEST_PROTOCOL)

    ax = plt.subplot(121)
    pickle.dump(ax, BytesIO(), pickle.HIGHEST_PROTOCOL)

    ax = plt.axes(projection='polar')
    plt.plot(list(xrange(10)), label='foobar')
    plt.legend()

#    recursive_pickle(fig)
    pickle.dump(ax, BytesIO(), pickle.HIGHEST_PROTOCOL)

#    ax = plt.subplot(121, projection='hammer')
#    recursive_pickle(ax, 'figure')
#    pickle.dump(ax, BytesIO(), pickle.HIGHEST_PROTOCOL)

    plt.figure()
    plt.bar(left=list(xrange(10)), height=list(xrange(10)))
    pickle.dump(plt.gca(), BytesIO(), pickle.HIGHEST_PROTOCOL)

    fig = plt.figure()
    ax = plt.axes()
    plt.plot(list(xrange(10)))
    ax.set_yscale('log')
    pickle.dump(fig, BytesIO(), pickle.HIGHEST_PROTOCOL)
Example #4
 def save(self, fn, compress=True):
     if compress and not fn.strip().lower().endswith('.gz'):
         fn = fn + '.gz'
     if compress:
         with gzip.open(fn, 'wb') as f:
             pickle.dump(self, f)
     else:
         with open(fn, 'wb') as f:
             pickle.dump(self, f)
Example #5
def train_model(args):
	data_loader = InputHandler(args.data_dir, args.batch_size, args.result_length)
	args.vocabulary_size = data_loader.vocabulary_size

	# Save the original files, so that we can load the model when sampling
	with open(os.path.join(args.snapshots_dir, CONFIGURATION_FILE), 'wb') as f:
		cPickle.dump(args, f)
	with open(os.path.join(args.snapshots_dir, WORDS_VOCABULARY_FILE), 'wb') as f:
		cPickle.dump((data_loader.words, data_loader.vocabulary), f)

	model = RNNModel(args.rnn_size, args.network_depth, args.batch_size, args.result_length,
					 args.vocabulary_size, args.gradient)

	with tf.Session() as session:
		tf.initialize_all_variables().run()
		saver = tf.train.Saver(tf.all_variables())
		for e in range(args.num_epochs):
			session.run(tf.assign(model.lr, args.training_rate * (args.decay_rate ** e)))
			data_loader.set_batch_pointer_to_zero()
			state = model.initial_state.eval()

			for b in range(data_loader.num_batches):
				x, y = data_loader.get_next_batch()
				feed = {model.input_data: x, model.targets: y, model.initial_state: state}
				train_loss, state, _ = session.run([model.cost, model.final_state, model.train_op], feed)
				if (e * data_loader.num_batches + b) % args.snapshot == 0 \
						or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result
					snapshot_path = os.path.join(args.snapshots_dir, 'model.ckpt')
					saver.save(session, snapshot_path, global_step = e * data_loader.num_batches + b)
					print("Model snapshot was taken to {}".format(snapshot_path))
Example #6
    def save(self, file_name="results"):
        """Persist the results.

        :param file_name: str, The name for the save file.
        """
        file_name = file_name + ".p"
        with open(file_name, 'wb') as f:
            pickle.dump(self, f, protocol=2)
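A possible loader for the file written above (a sketch, not part of the original class); protocol-2 pickles can be read by both Python 2 and Python 3:

import pickle

def load_results(file_name="results"):
    # Counterpart to save() above: read the object back from '<file_name>.p'.
    with open(file_name + ".p", "rb") as f:
        return pickle.load(f)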
Example #7
def test_complete():
    fig = plt.figure('Figure with a label?', figsize=(10, 6))

    plt.suptitle('Can you fit any more in a figure?')

    # make some arbitrary data
    x, y = np.arange(8), np.arange(10)
    data = u = v = np.linspace(0, 10, 80).reshape(10, 8)
    v = np.sin(v * -0.6)

    plt.subplot(3, 3, 1)
    plt.plot(list(xrange(10)))

    plt.subplot(3, 3, 2)
    plt.contourf(data, hatches=['//', 'ooo'])
    plt.colorbar()

    plt.subplot(3, 3, 3)
    plt.pcolormesh(data)

    plt.subplot(3, 3, 4)
    plt.imshow(data)

    plt.subplot(3, 3, 5)
    plt.pcolor(data)

    plt.subplot(3, 3, 6)
    plt.streamplot(x, y, u, v)

    plt.subplot(3, 3, 7)
    plt.quiver(x, y, u, v)

    plt.subplot(3, 3, 8)
    plt.scatter(x, x**2, label='$x^2$')
    plt.legend(loc='upper left')

    plt.subplot(3, 3, 9)
    plt.errorbar(x, x * -0.5, xerr=0.2, yerr=0.4)

    ###### plotting is done, now test its pickle-ability #########

    # Uncomment to debug any unpicklable objects. This is slow (~200 seconds).
#    recursive_pickle(fig)

    result_fh = BytesIO()
    pickle.dump(fig, result_fh, pickle.HIGHEST_PROTOCOL)

    plt.close('all')

    # make doubly sure that there are no figures left
    assert_equal(plt._pylab_helpers.Gcf.figs, {})

    # wind back the fh and load in the figure
    result_fh.seek(0)
    fig = pickle.load(result_fh)

    # make sure there is now a figure manager
    assert_not_equal(plt._pylab_helpers.Gcf.figs, {})

    assert_equal(fig.get_label(), 'Figure with a label?')
Example #8
 def setUp(self):
     numpy.random.seed(9 + 5 + 2015)
     self.train_features_mock = [
         numpy.random.randint(0, 256, (10, 3, 32, 32)).astype('uint8')
         for i in range(5)]
     self.train_targets_mock = [
         numpy.random.randint(0, 10, (10,)).astype('uint8')
         for i in range(5)]
     self.test_features_mock = numpy.random.randint(
         0, 256, (10, 3, 32, 32)).astype('uint8')
     self.test_targets_mock = numpy.random.randint(
         0, 10, (10,)).astype('uint8')
     self.tempdir = tempfile.mkdtemp()
     cwd = os.getcwd()
     os.chdir(self.tempdir)
     os.mkdir('cifar-10-batches-py')
     for i, (x, y) in enumerate(zip(self.train_features_mock,
                                    self.train_targets_mock)):
         filename = os.path.join(
             'cifar-10-batches-py', 'data_batch_{}'.format(i + 1))
         with open(filename, 'wb') as f:
             cPickle.dump({'data': x, 'labels': y}, f)
     filename = os.path.join('cifar-10-batches-py', 'test_batch')
     with open(filename, 'wb') as f:
         cPickle.dump({'data': self.test_features_mock,
                       'labels': self.test_targets_mock},
                      f)
     with tarfile.open('cifar-10-python.tar.gz', 'w:gz') as tar_file:
         tar_file.add('cifar-10-batches-py')
     os.chdir(cwd)
Example #9
 def setUp(self):
     numpy.random.seed(9 + 5 + 2015)
     self.train_features_mock = numpy.random.randint(
         0, 256, (10, 3, 32, 32)).astype('uint8')
     self.train_fine_labels_mock = numpy.random.randint(
         0, 100, (10,)).astype('uint8')
     self.train_coarse_labels_mock = numpy.random.randint(
         0, 20, (10,)).astype('uint8')
     self.test_features_mock = numpy.random.randint(
         0, 256, (10, 3, 32, 32)).astype('uint8')
     self.test_fine_labels_mock = numpy.random.randint(
         0, 100, (10,)).astype('uint8')
     self.test_coarse_labels_mock = numpy.random.randint(
         0, 20, (10,)).astype('uint8')
     self.tempdir = tempfile.mkdtemp()
     cwd = os.getcwd()
     os.chdir(self.tempdir)
     os.mkdir('cifar-100-python')
     filename = os.path.join('cifar-100-python', 'train')
     with open(filename, 'wb') as f:
         cPickle.dump({'data': self.train_features_mock.reshape((10, -1)),
                       'fine_labels': self.train_fine_labels_mock,
                       'coarse_labels': self.train_coarse_labels_mock}, f)
     filename = os.path.join('cifar-100-python', 'test')
     with open(filename, 'wb') as f:
         cPickle.dump({'data': self.test_features_mock.reshape((10, -1)),
                       'fine_labels': self.test_fine_labels_mock,
                       'coarse_labels': self.test_coarse_labels_mock}, f)
     with tarfile.open('cifar-100-python.tar.gz', 'w:gz') as tar_file:
         tar_file.add('cifar-100-python')
     os.chdir(cwd)
Example #10
def store_and_or_load_data(outputdir, dataset, data_dir):
    save_path = os.path.join(outputdir, dataset + '_Manager.pkl')
    if not os.path.exists(save_path):
        lock = lockfile.LockFile(save_path)
        while not lock.i_am_locking():
            try:
                lock.acquire(timeout=60)  # wait up to 60 seconds
            except lockfile.LockTimeout:
                lock.break_lock()
                lock.acquire()
        print('I locked', lock.path)
        # It is not yet sure, whether the file already exists
        try:
            if not os.path.exists(save_path):
                D = SimpleDataManager(dataset, data_dir, verbose=True)
                with open(save_path, 'wb') as fh:
                    pickle.dump(D, fh, -1)
            else:
                D = pickle.load(open(save_path, 'rb'))
        except Exception:
            raise
        finally:
            lock.release()
    else:
        D = pickle.load(open(save_path, 'rb'))
        print('Loaded data')
    return D
Example #11
    def _run_tmva_training(self, info):
        """
        Run subprocess to train tmva factory

        :param info: class with additional information
        """
        tmva_process = subprocess.Popen(
            'cd {directory}; {executable} -c "from rep.estimators import _tmvaFactory; _tmvaFactory.main()"'.format(
                directory=info.directory,
                executable=sys.executable),
            stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT,
            shell=True)

        cPickle.dump(self, tmva_process.stdin)
        cPickle.dump(info, tmva_process.stdin)
        stdout, stderr = tmva_process.communicate()
        assert tmva_process.returncode == 0, \
            'ERROR: TMVA process finished incorrectly \n LOG: %s \n %s' % (stderr, stdout)

        assert 'TrainTree' in root_numpy.list_trees(os.path.join(info.directory, info.tmva_root)), \
            'ERROR: Result file does not contain TrainTree'

        xml_filename = os.path.join(info.directory, 'weights',
                                    '{job}_{name}.weights.xml'.format(job=info.tmva_job, name=self._method_name))
        with open(xml_filename, 'r') as xml_file:
            self.formula_xml = xml_file.read()
Example #12
def create_content_dir():
    """
    Make empty files for colnames.pkl, colnames_all.pkl and archfiles.db3
    for the current content type ft['content'].

    This only works within the development (git) directory in conjunction
    with the --create option.
    """
    dirname = msid_files['contentdir'].abs
    if not os.path.exists(dirname):
        logger.info('Making directory {}'.format(dirname))
        os.makedirs(dirname)

    empty = set()
    if not os.path.exists(msid_files['colnames'].abs):
        with open(msid_files['colnames'].abs, 'wb') as f:
            pickle.dump(empty, f, protocol=0)
    if not os.path.exists(msid_files['colnames_all'].abs):
        with open(msid_files['colnames_all'].abs, 'wb') as f:
            pickle.dump(empty, f, protocol=0)

    if not os.path.exists(msid_files['archfiles'].abs):
        archfiles_def = open('archfiles_def.sql').read()
        filename = msid_files['archfiles'].abs
        logger.info('Creating db {}'.format(filename))
        db = Ska.DBI.DBI(dbi='sqlite', server=filename, autocommit=False)
        db.execute(archfiles_def)
        db.commit()
Example #13
def cached_yaml_load(path):
    """
    Load a pickled YAML file from cache.

    :param str path: The path to load.
    :returns: The loaded YAML file, possibly from cache.
    :rtype: dict
    """
    path = os.path.abspath(path)

    ho = hashlib.sha256()
    ho.update(path.encode('UTF-8'))
    h = ho.hexdigest()

    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    p = os.path.join(CACHE_DIR, h)
    if os.path.exists(p):
        # cache has file
        if os.path.getmtime(p) >= os.path.getmtime(path):
            # check that it's newer
            try:
                with open(p, 'rb') as file:
                    return pickle.load(file)
            except EOFError:
                os.remove(p)  # cache file corrupted, recreate it

    y = yaml.load(codecs.open(path, "r", encoding="utf-8"))
    with open(p, 'wb') as file:
        pickle.dump(y, file)
    return y
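A minimal usage sketch (the file name is hypothetical; assumes CACHE_DIR points at a writable directory):

# The first call parses the YAML and stores a pickle named after the SHA-256
# of the absolute path; later calls return the pickled copy as long as it is
# not older than the YAML file itself.
settings = cached_yaml_load('settings.yaml')
settings_again = cached_yaml_load('settings.yaml')  # served from CACHE_DIR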
Example #14
def train(epoch_num, output_dir, *args):

    model_name = args[0][0]
    file       = args[0][1]
    log_name   = "logs/" + model_name + ".log"
    model_name = output_dir + "training/" + model_name

    # direct stdout to log file
    log_file = open(log_name, 'a+')

    # TODO: gram_num here is a magic number!
    train_chars = LargeCharFeatureGenerator(file, 10)

    if os.path.isfile(model_name):
        with open(model_name,'rb') as f:
            model = cPickle.load(f)
    else:
        model = SimpleLSTM(train_chars.vocab_size)

    avg_loss = train_with_sgd(model,
                              train_chars,
                              nepoch=_NEPOCH,
                              learning_rate=_LEARNING_RATE,
                              mini_batch_size=_BATCH_SIZE)

    with open(model_name, 'wb') as f:
        cPickle.dump(model, f, protocol=cPickle.HIGHEST_PROTOCOL)

    log_file.write(str(avg_loss))
    log_file.close()
Example #15
    def append_flipped_rois(self):
        """
        Append horizontally flipped entries to the ROI database.
        This method does not depend on the specific dataset, so it is
        implemented here. Note that it does not actually flip the images;
        it only flips the bounding boxes.
        """
        cache_file = os.path.join(self.cache_path, self.name + '_' + cfg.TRAIN.PROPOSAL_METHOD + '_roidb_flip.pkl')
        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                flip_roidb = cPickle.load(fid)
            print('{} gt flipped roidb loaded from {}'.format(self.name, cache_file))
        else:
            num_images = self.num_images
            widths = [PIL.Image.open(self.image_path_at(i)).size[0]
                      for i in range(num_images)]
            flip_roidb = []
            for i in range(num_images):
                boxes = self.roidb[i]['boxes'].copy()
                oldx1 = boxes[:, 0].copy()
                oldx2 = boxes[:, 2].copy()
                boxes[:, 0] = widths[i] - oldx2 - 1
                boxes[:, 2] = widths[i] - oldx1 - 1
                assert (boxes[:, 2] >= boxes[:, 0]).all()
                entry = {'boxes': boxes,
                         'gt_overlaps': self.roidb[i]['gt_overlaps'],
                         'gt_classes': self.roidb[i]['gt_classes'],
                         'flipped': True}
                flip_roidb.append(entry)
            with open(cache_file, 'wb') as fid:
                cPickle.dump(flip_roidb, fid, cPickle.HIGHEST_PROTOCOL)
            print('wrote gt flipped roidb to {}'.format(cache_file))

        self.roidb.extend(flip_roidb)
        self._image_index *= 2
Example #16
def pickle_dump(data, filename):
    """
    Equivalent to pickle.dump(data, open(filename, 'wb'))
    but closes the file to prevent filehandle leakage.
    """
    with open(filename, 'wb') as fh:
        pickle.dump(data, fh)
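A matching loader in the same spirit (a sketch, not part of the original module):

import pickle

def pickle_load(filename):
    """
    Equivalent to pickle.load(open(filename, 'rb'))
    but closes the file to prevent filehandle leakage.
    """
    with open(filename, 'rb') as fh:
        return pickle.load(fh)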
Example #17
def train_loop():
    graph_generated = False
    while True:
        while data_q.empty():
            time.sleep(0.1)
        inp = data_q.get()
        if inp == 'end':  # quit
            res_q.put('end')
            break
        elif inp == 'train':  # restart training
            res_q.put('train')
            train = True
            continue
        elif inp == 'val':  # start validation
            pickle.dump(model, open(LOGPATH + 'model', 'wb'), -1)
            res_q.put('val')
            train = False
            continue


        x = xp.asarray(inp[0])
        y = xp.asarray(inp[1])

        if train:
            optimizer.zero_grads()
            loss = model.forward(x, y, train=True)
            loss.backward()
            optimizer.update()

        else:
            loss = model.forward(x, y, train=False)

        res_q.put(float(cuda.to_cpu(loss.data)))
        del loss, x, y
Example #18
def getAllDataPickle(p_bForce=False):
    #get relevant paths
    trainGenreNames, trainGenrePaths = getAllGenrePaths(LIBRARY_PATH + 'train_small/')
    testGenreNames, testGenrePaths   = getAllGenrePaths(LIBRARY_PATH + 'test_small/')
    pickle_file =                                       LIBRARY_PATH + 'allData.pickle'
    
    #obtain data for each genre in their individual pickle file
    allPickledTrainFilenames = getIndividualGenrePickles(trainGenrePaths, p_bForce)
    allPickledTestFilenames  = getIndividualGenrePickles(testGenrePaths,  p_bForce)

    #merge and randomize data from all genres into wholedatasets for training, validation, and test
    wholeValidDataset, wholeValidLabels, wholeTrainDataset, wholeTrainLabels = getWholeDataFromIndividualGenrePickles(allPickledTrainFilenames, s_iTrainSize, s_iValid_size)
    _,                                _, wholeTestDataset,  wholeTestLabels  = getWholeDataFromIndividualGenrePickles(allPickledTestFilenames,  s_iTestSize)
    wholeTrainDataset, wholeTrainLabels = randomize(wholeTrainDataset, wholeTrainLabels)
    wholeTestDataset,  wholeTestLabels  = randomize(wholeTestDataset,  wholeTestLabels)
    wholeValidDataset, wholeValidLabels = randomize(wholeValidDataset, wholeValidLabels)

    #save the data for later reuse: 
    try:
        f = open(pickle_file, 'wb')
        save = {'wholeTrainDataset':    wholeTrainDataset,
                'wholeTrainLabels':     wholeTrainLabels,
                'wholeValidDataset':    wholeValidDataset,
                'wholeValidLabels':     wholeValidLabels,
                'wholeTestDataset':     wholeTestDataset,
                'wholeTestLabels':      wholeTestLabels}
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        f.close()
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise

    print ('\n================== DATASETS BUILT ================')
    return pickle_file
Example #19
 def mutate_value(self):
     """
     Allows mutation of the value safely.
     """
     # Get the semaphore with an emergency timeout to detect deadlock conditions
     try:
         self.semaphore.acquire(self.TIMEOUT)
     except posix_ipc.BusyError:
         raise self.deadlock_error
     try:
         # Load the value from the shared memory segment (if populated)
         self.mmap.seek(0)
         # Memory can be empty but still have a length.  Pickle protocol-2
         # opcodes start at 0x80, so if the first byte is zero the memory
         # has not been initialized yet.
         if not self.mmap.read_byte():
             value = self.DEFAULT_FACTORY()
         else:
             self.mmap.seek(0)
             try:
                 value = pickle.load(self.mmap)
             except EOFError:
                 value = self.DEFAULT_FACTORY()
         # Let the inside run
         yield value
         # Dump the value back into the shared memory segment
         self.mmap.seek(0)
         pickle.dump(value, self.mmap, protocol=2)
     finally:
         # Release semaphore
         self.semaphore.release()
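Since the method yields the value, it is presumably wrapped with contextlib.contextmanager (an assumption); a hedged usage sketch, where the `shared_state` instance and the dict default are hypothetical:

# Mutate the shared value under the semaphore; on exit the method above
# re-pickles it into the shared memory segment.
with shared_state.mutate_value() as value:
    value['hits'] = value.get('hits', 0) + 1  # assumes DEFAULT_FACTORY = dict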
Example #20
def write_nl(model, nl_filename, **kwds):
    """
    Writes a Pyomo model in NL file format and stores
    information about the symbol map that allows it to be
    recovered at a later time for a Pyomo model with
    matching component names.
    """
    symbol_map_filename = nl_filename+".symbol_map.pickle"

    # write the model and obtain the symbol_map
    _, smap_id = model.write(nl_filename,
                             format=ProblemFormat.nl,
                             io_options=kwds)
    symbol_map = model.solutions.symbol_map[smap_id]

    # save a persistent form of the symbol_map (using pickle) by
    # storing the NL file label with a ComponentUID, which is
    # an efficient lookup code for model components (created
    # by John Siirola)
    tmp_buffer = {} # this makes the process faster
    symbol_cuid_pairs = tuple(
        (symbol, ComponentUID(var_weakref(), cuid_buffer=tmp_buffer))
        for symbol, var_weakref in symbol_map.bySymbol.items())
    with open(symbol_map_filename, "wb") as f:
        pickle.dump(symbol_cuid_pairs, f)

    return symbol_map_filename
Example #21
    def save_model(self, model, idx, seed):
        # This should fail if no models directory exists
        filepath = os.path.join(self.get_model_dir(),
                                '%s.%s.model' % (seed, idx))

        with open(filepath, 'wb') as fh:
            pickle.dump(model, fh, -1)
Example #22
def parse_ctgs(bestedges, frgtoctg):
    cache = "frgtoctg.cache"
    if need_update(frgtoctg, cache):
        reads_to_ctgs = {}
        frgtodeg = frgtoctg.replace(".frgctg", ".frgdeg")
        iidtouid = frgtoctg.replace(".posmap.frgctg", ".iidtouid")
        fp = open(iidtouid)
        frgstore = {}
        for row in fp:
            tag, iid, uid = row.split()
            if tag == "FRG":
                frgstore[uid] = int(iid)

        for pf, f in zip(("ctg", "deg"), (frgtoctg, frgtodeg)):
            fp = open(f)
            logging.debug("Parse posmap file `{0}`".format(f))
            for row in fp:
                frg, ctg = row.split()[:2]
                frg = frgstore[frg]
                reads_to_ctgs[frg] = pf + ctg
            logging.debug("Loaded mapping: {0}".format(len(reads_to_ctgs)))

        fw = open(cache, "wb")
        dump(reads_to_ctgs, fw)
        fw.close()
        logging.debug("Contig mapping written to `{0}`".format(cache))

    reads_to_ctgs = load(open(cache, "rb"))
    logging.debug("Contig mapping loaded from `{0}`".format(cache))
    return reads_to_ctgs
Example #23
def fetch_train_thoughts(m, pcs, batches, name="trainthoughts"):
    all_thoughts = []
    for i in range(batches):
        ipt, opt = multi_training.getPieceBatch(pcs)
        thoughts = m.update_thought_fun(ipt, opt)
        all_thoughts.append((ipt, opt, thoughts))
    pickle.dump(all_thoughts, open('output/' + name + '.p', 'wb'))
Example #24
def get_abinit_variables():
    """Returns the database with the description of the ABINIT variables."""
    global __VARS_DATABASE

    if __VARS_DATABASE is None: 
        pickle_file = os.path.join(os.getenv("HOME"), ".abinit", "abipy", "abinit_vars.pickle")
        
        if os.path.exists(pickle_file): 
            #print("Reading from pickle")
            with open(pickle_file, "rb") as fh:
                __VARS_DATABASE = pickle.load(fh)

        else:
            # Make dir and file if not present.
            if not os.path.exists(os.path.dirname(pickle_file)):
                os.makedirs(os.path.dirname(pickle_file))

            #print("Reading database from YAML file and generating pickle version. It may take a while...")
            from abipy import data as abidata
            yaml_file = abidata.var_file('abinit_vars.yml')
            with open(yaml_file, "rt") as fh:
                var_list = yaml.load(fh)

            # Build ordered dict with variables in alphabetical order.
            var_list = sorted(var_list, key=lambda v: v.varname)
            __VARS_DATABASE = VariableDatabase([(v.varname, v) for v in var_list])

            # Save object to pickle file so that can we can reload it from pickle instead of yaml (slower)
            with open(pickle_file, "wb") as fh:
                pickle.dump(__VARS_DATABASE, fh)

    return __VARS_DATABASE
Example #25
    def load_additional_args(self, config):
        """
        """
        self.set_attribute(config, 'request_powermin', 'General',
                           'power min', cast='float')
        self.set_attribute(config, 'request_powermax', 'General',
                           'power max', cast='float')

        # read in the coefficients from file
        coeffs = self.config_get(config, 'PowerMeter', 'coefficients')
        if coeffs is not None:
            self.power_meter_calibration = MeterCalibration(coeffs)

        coeffs = self.config_get(config, 'PowerOutput', 'coefficients')
        if coeffs is not None:

            p = os.path.join(paths.hidden_dir, '{}_power_calibration'.format(self.name.split('.')[0]))

            obj = MeterCalibration(coeffs)
            # dump to the hidden dir
            # the manager will use it directly
            try:
                self.info('loading power calibration from config file')
                with open(p, 'wb') as f:
                    pickle.dump(obj, f)
            except (OSError, pickle.PickleError):
                self.warning('failed loading power output calibration')

        return super(FusionsCO2LogicBoard, self).load_additional_args(config)
Example #26
    def _run_tmva_predict(info, data):
        """
        Run a subprocess to compute predictions with the trained TMVA estimator

        :param info: class with additional information
        :param data: data to compute predictions for
        """
        tmva_process = subprocess.Popen(
            'cd "{directory}"; {executable} -c "from rep.estimators import _tmvaReader; _tmvaReader.main()"'.format(
                directory=info.directory,
                executable=sys.executable),
            stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT,
            shell=True)

        try:
            cPickle.dump(info, tmva_process.stdin)
            cPickle.dump(data, tmva_process.stdin)
        except:
            # Doing nothing, there is check later.
            pass
        stdout, stderr = tmva_process.communicate()
        assert tmva_process.returncode == 0, \
            'ERROR: TMVA process finished incorrectly \n LOG: %s \n %s' % (stderr, stdout)
        with open(info.result_filename, 'rb') as predictions_file:
            predictions = cPickle.load(predictions_file)
        return predictions
Example #27
def save_classifier(cl, fn, use_joblib=True, **kwargs):
    """Save a classifier to disk.

    Parameters
    ----------
    cl : classifier object
        Pickleable object or a classify.VigraRandomForest object.
    fn : string
        Writeable path/filename.
    use_joblib : bool, optional
        Whether to prefer joblib persistence to pickle.
    kwargs : keyword arguments
        Keyword arguments to be passed on to either `pck.dump` or 
        `joblib.dump`.

    Returns
    -------
    None

    Notes
    -----
    For joblib persistence, `compress=3` is the default.
    """
    if isinstance(cl, VigraRandomForest):
        cl.save_to_disk(fn)
    elif use_joblib and sklearn_available:
        if "compress" not in kwargs:
            kwargs["compress"] = 3
        joblib.dump(cl, fn, **kwargs)
    else:
        with open(fn, "wb") as f:
            pck.dump(cl, f, protocol=kwargs.get("protocol", 2))
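A matching loader sketch (an assumption, not the library's API); it reuses the module-level `joblib` and `sklearn_available` names referenced by save_classifier and does not cover the VigraRandomForest case:

import pickle as pck

def load_classifier(fn, use_joblib=True):
    """Load a classifier saved by save_classifier (hypothetical helper)."""
    if use_joblib and sklearn_available:
        # joblib.load transparently handles files written by joblib.dump
        return joblib.load(fn)
    with open(fn, "rb") as f:
        return pck.load(f)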
Example #28
def create_pickle(data_folders, force=False):
    """Function for converting data into separate pickle files for each label.
    data_folders is the list of folder names of all classes.
    Set force = False if pickle files are already created and are not to be overwritten.
    Set force = True to overwrite already created pickle files.
    """
    # List of names of pickle files for individual classes
    dataset_names = []

    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)

        if os.path.exists(set_filename) and not force:
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_emotion(folder)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names
Example #29
    def _run_tmva_training(self, info, X, y, sample_weight):
        """
        Run subprocess to train tmva factory

        :param info: class with additional information
        """
        tmva_process = subprocess.Popen(
            'cd "{directory}"; {executable} -c "from rep.estimators import _tmvaFactory; _tmvaFactory.main()"'.format(
                directory=info.directory,
                executable=sys.executable),
            stdin=PIPE, stdout=PIPE, stderr=subprocess.STDOUT,
            shell=True)

        try:
            cPickle.dump(self, tmva_process.stdin)
            cPickle.dump(info, tmva_process.stdin)
            cPickle.dump(X, tmva_process.stdin)
            cPickle.dump(y, tmva_process.stdin)
            cPickle.dump(sample_weight, tmva_process.stdin)
        except:
            # continuing, next we check the output of process
            pass
        stdout, stderr = tmva_process.communicate()
        assert tmva_process.returncode == 0, \
            'ERROR: TMVA process finished incorrectly \n LOG: %s \n %s' % (stderr, stdout)

        xml_filename = os.path.join(info.directory, 'weights',
                                    '{job}_{name}.weights.xml'.format(job=info.tmva_job, name=self._method_name))
        with open(xml_filename, 'r') as xml_file:
            self.formula_xml = xml_file.read()
Example #30
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(e * data_loader.num_batches + b,
                            args.num_epochs * data_loader.num_batches,
                            e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step = e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
Example #31
    return dataset


train_data = load_data('ptb.train.txt')
if args.test:
    train_data = train_data[:100]
valid_data = load_data('ptb.valid.txt')
if args.test:
    valid_data = valid_data[:100]
test_data = load_data('ptb.test.txt')
if args.test:
    test_data = test_data[:100]

print('#vocab =', len(vocab))
with open('vocab.bin', 'wb') as f:
    pickle.dump(vocab, f)

# Prepare RNNLM model, defined in net.py
lm = net.RNNLM(len(vocab), n_units)
model = L.Classifier(lm)
model.compute_accuracy = False  # we only want the perplexity
for param in model.params():
    data = param.data
    data[:] = np.random.uniform(-0.1, 0.1, data.shape)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.SGD(lr=1.)
optimizer.setup(model)
Example #32
def save_pickle(fname, tweets):
    with open(fname + '.pickle', 'wb') as f:
        pickle.dump(tweets, f, pickle.HIGHEST_PROTOCOL)
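A matching reader (a sketch, not from the original source):

import pickle

def load_pickle(fname):
    # Read the tweets back from the '<fname>.pickle' file written above.
    with open(fname + '.pickle', 'rb') as f:
        return pickle.load(f)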
Example #33
def train_lstm(
        dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
        patience=10,  # Number of epoch to wait before early stop if no progress
        max_epochs=5000,  # The maximum number of epoch to run
        dispFreq=10,  # Display to stdout the training progress every N updates
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
        n_words=10000,  # Vocabulary size
        optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate).
        encoder='lstm',  # TODO: can be removed must be lstm.
        saveto='lstm_model.npz',  # The best model will be saved there
        validFreq=370,  # Compute the validation error after this number of updates.
        saveFreq=1110,  # Save the parameters after every saveFreq updates
        maxlen=100,  # Sequences longer than this get ignored
        batch_size=16,  # The batch size during training.
        valid_batch_size=64,  # The batch size used for validation/test set.
        dataset='authors2',

        # Parameter for extra option
        noise_std=0.,
        use_dropout=True,  # if False slightly faster, but worse test error.
                           # This frequently needs a bigger model.
        reload_model=None,  # Path to a saved model we want to start from.
        test_size=-1,  # If >0, we keep only this number of test example.
):

    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    train, valid, test = load_data(n_words=n_words,
                                   valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    #print(len(train[0]));
    #print(len(train[1]));
    #print(len(test[0]));
    #print(len(test[1]));
    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swap the axis!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'),
                                -1)
                    print('Done')

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or valid_err <=
                            numpy.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print(('Train ', train_err, 'Valid ', valid_err, 'Test ',
                           test_err))

                    if (len(history_errs) > patience and valid_err >=
                            numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)
    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print('The code ran for %d epochs, with %f sec/epoch' %
          ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print(('Training took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
Example #34
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains an HMM model for segmentation-classification using a directory where WAV files and .segments (ground-truth) files are stored.
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmmModelName:    the name of the HMM model to be stored
     - mtWin:        mid-term window size
     - mtStep:        mid-term window step
    RETURNS:
     - hmm:            the resulting HMM object
     - classNames:        a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    flagsAll = numpy.array([])
    initializedFall = False
    classesAll = []
    # for each WAV file
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        wavFile = f
        # open for annotated file
        gtFile = f.replace('.wav', '.segments')
        # if current WAV file does not have annotation -> skip
        if not os.path.isfile(gtFile):
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels,
                                       mtStep)  # convert to flags
        # update classnames:
        for c in classNames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
        [F,
         _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                     round(Fs * 0.050),
                                     round(Fs * 0.050))  # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if not initializedFall:
            Fall = F
            initializedFall = True
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(
        Fall, flagsAll)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
Example #35
 def finish(self):
     # type: () -> None
     # dump the coverage data to a pickle file too
     picklepath = path.join(self.outdir, 'undoc.pickle')
     with open(picklepath, 'wb') as dumpfile:
         pickle.dump((self.py_undoc, self.c_undoc), dumpfile)
Example #36
 def sync_session(self):
     if self.resume_file:
         with open(self.resume_file, 'wb') as rf:
             cPickle.dump(self.scan_session, rf, 2)
Example #37
def test_nest():
    crs = cimgt.GoogleTiles().crs
    z0 = cimg_nest.ImageCollection('aerial z0 test', crs)
    z0.scan_dir_for_imgs(os.path.join(_TEST_DATA_DIR, 'z_0'),
                         glob_pattern='*.png',
                         img_class=RoundedImg)

    z1 = cimg_nest.ImageCollection('aerial z1 test', crs)
    z1.scan_dir_for_imgs(os.path.join(_TEST_DATA_DIR, 'z_1'),
                         glob_pattern='*.png',
                         img_class=RoundedImg)

    z2 = cimg_nest.ImageCollection('aerial z2 test', crs)
    z2.scan_dir_for_imgs(os.path.join(_TEST_DATA_DIR, 'z_2'),
                         glob_pattern='*.png',
                         img_class=RoundedImg)

    # make sure all the images from z1 are contained by the z0 image. The
    # only reason this might occur is if the tfw files are handling
    # floating point values badly
    for img in z1.images:
        if not z0.images[0].bbox().contains(img.bbox()):
            raise IOError('The test images aren\'t all "contained" by the '
                          'z0 images, the nest cannot possibly work.\n '
                          'img {!s} not contained by {!s}\nExtents: {!s}; '
                          '{!s}'.format(img, z0.images[0], img.extent,
                                        z0.images[0].extent))
    nest_z0_z1 = cimg_nest.NestedImageCollection('aerial test', crs, [z0, z1])

    nest = cimg_nest.NestedImageCollection('aerial test', crs, [z0, z1, z2])

    z0_key = ('aerial z0 test', z0.images[0])

    assert_true(z0_key in nest_z0_z1._ancestry.keys())
    assert_equal(len(nest_z0_z1._ancestry), 1)

    # check that it has figured out that all the z1 images are children of
    # the only z0 image
    for img in z1.images:
        key = ('aerial z0 test', z0.images[0])
        assert_in(('aerial z1 test', img), nest_z0_z1._ancestry[key])

    x1_y0_z1, = [
        img for img in z1.images if img.filename.endswith('z_1/x_1_y_0.png')
    ]

    assert_equal((1, 0, 1), _tile_from_img(x1_y0_z1))

    assert_equal([(2, 0, 2), (2, 1, 2), (3, 0, 2), (3, 1, 2)],
                 sorted([
                     _tile_from_img(img)
                     for z, img in nest.subtiles(('aerial z1 test', x1_y0_z1))
                 ]))

    nest_from_config = gen_nest()
    # check that the images in the nest built from configuration are the
    # same as those created by hand.
    for name in nest_z0_z1._collections_by_name.keys():
        for img in nest_z0_z1._collections_by_name[name].images:
            collection = nest_from_config._collections_by_name[name]
            assert_in(img, collection.images)

    assert_equal(nest_z0_z1._ancestry, nest_from_config._ancestry)

    # check that a nest can be pickled and unpickled easily.
    s = io.BytesIO()
    pickle.dump(nest_z0_z1, s)
    s.seek(0)
    nest_z0_z1_from_pickle = pickle.load(s)

    assert_equal(nest_z0_z1._ancestry, nest_z0_z1_from_pickle._ancestry)
Example #38
def update_msid_files(filetype, archfiles):
    colnames = pickle.load(open(msid_files['colnames'].abs, 'rb'))
    colnames_all = pickle.load(open(msid_files['colnames_all'].abs, 'rb'))
    old_colnames = colnames.copy()
    old_colnames_all = colnames_all.copy()

    # Setup db handle with autocommit=False so that error along the way aborts insert transactions
    db = Ska.DBI.DBI(dbi='sqlite',
                     server=msid_files['archfiles'].abs,
                     autocommit=False)

    # Get the last row number from the archfiles table
    out = db.fetchone('SELECT max(rowstop) FROM archfiles')
    row = out['max(rowstop)'] or 0
    last_archfile = db.fetchone('SELECT * FROM archfiles where rowstop=?',
                                (row, ))

    archfiles_overlaps = []
    dats = []
    archfiles_processed = []

    content_is_derived = (filetype['instrum'] == 'DERIVED')

    for i, f in enumerate(archfiles):
        get_data = (read_derived if content_is_derived else read_archfile)
        dat, archfiles_row = get_data(i, f, filetype, row, colnames, archfiles,
                                      db)
        if dat is None:
            continue

        # If creating new content type and there are no existing colnames, then
        # define the column names now.  Filter out any multidimensional
        # columns, including (typically) QUALITY.
        if opt.create and not colnames:
            colnames = set(dat.dtype.names)
            for colname in dat.dtype.names:
                if len(dat[colname].shape) > 1:
                    logger.info(
                        'Removing column {} from colnames because shape = {}'.
                        format(colname, dat[colname].shape))
                    colnames.remove(colname)

        # Ensure that the time gap between the end of the last ingested archive
        # file and the start of this one is less than opt.max_gap (or
        # filetype-based defaults).  If this fails then break out of the
        # archfiles processing but continue on to ingest any previously
        # successful archfiles
        if last_archfile is None:
            time_gap = 0
        else:
            time_gap = archfiles_row['tstart'] - last_archfile['tstop']
        max_gap = opt.max_gap
        if max_gap is None:
            if filetype['instrum'] in ['EPHEM', 'DERIVED']:
                max_gap = 601
            elif filetype['content'] == 'ACISDEAHK':
                max_gap = 10000
                # From P.Plucinsky 2011-09-23
                # If ACIS is executing an Event Histogram run while in FMT1,
                # the telemetry stream will saturate.  The amount of time for
                # an opening in the telemetry to appear such that DEA HKP
                # packets can get out is a bit indeterminate.  The histograms
                # integrate for 5400s and then they are telemetered.  I would
                # suggest 6000s, but perhaps you would want to double that to
                # 12000s.
            elif filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
                # 100 years => no max gap for safe mode telemetry or dwell mode telemetry
                max_gap = 100 * 3.1e7
            else:
                max_gap = 32.9
        if time_gap > max_gap:
            logger.warning(
                'WARNING: found gap of %.2f secs between archfiles %s and %s',
                time_gap, last_archfile['filename'], archfiles_row['filename'])
            if opt.create:
                logger.warning(
                    '       Allowing gap because of opt.create=True')
            elif DateTime() - DateTime(
                    archfiles_row['tstart']) > opt.allow_gap_after_days:
                # After 4 days (by default) just let it go through because this is
                # likely a real gap and will not be fixed by subsequent processing.
                # This can happen after normal sun mode to SIM products.
                logger.warning('       Allowing gap because arch file '
                               'start is more than {} days old'.format(
                                   opt.allow_gap_after_days))
            else:
                break
        elif time_gap < 0:
            # Overlapping archfiles - deal with this in append_h5_col
            archfiles_overlaps.append((last_archfile, archfiles_row))

        # Update the last_archfile values.
        last_archfile = archfiles_row

        # A very small number of archive files (a few) have a problem where the
        # quality column tform is specified as 3B instead of 17X (for example).
        # This breaks things, so in this case just skip the file.  However
        # since last_archfile is set above the gap check considers this file to
        # have been ingested.
        if not content_is_derived and dat['QUALITY'].shape[1] != len(
                dat.dtype.names):
            logger.warning(
                'WARNING: skipping because of quality size mismatch: %d %d' %
                (dat['QUALITY'].shape[1], len(dat.dtype.names)))
            continue

        # Mark the archfile as ingested in the database and add to list for
        # subsequent relocation into arch_files archive.  In the case of a gap
        # where ingest is stopped before all archfiles are processed, this will
        # leave files either in a tmp dir (HEAD) or in the stage dir (OCC).
        # In the latter case this allows for successful processing later when the
        # gap gets filled.
        archfiles_processed.append(f)
        if not opt.dry_run:
            db.insert(archfiles_row, 'archfiles')

        # Capture the data for subsequent storage in the hdf5 files
        dats.append(dat)

        # Update the running list of column names.  Colnames_all is the maximal (union)
        # set giving all column names seen in any file for this content type.  Colnames
        # was historically the minimal (intersection) set giving the list of column names
        # seen in every file, but as of 0.39 it is allowed to grow as well to accommodate
        # adding MSIDs in the TDB.  Include only 1-d columns, not things like AEPERR
        # in PCAD8ENG which is a 40-element binary vector.
        colnames_all.update(dat.dtype.names)
        colnames.update(name for name in dat.dtype.names
                        if dat[name].ndim == 1)

        row += len(dat)

    if dats:
        logger.verbose('Writing accumulated column data to h5 file at ' +
                       time.ctime())
        data_lens = set()
        processed_cols = set()
        for colname in colnames:
            ft['msid'] = colname
            if not os.path.exists(msid_files['msid'].abs):
                make_h5_col_file(dats, colname)
                if not opt.create:
                    # New MSID was found for this content type.  This must be associated with
                    # an update to the TDB.  Skip for the moment to ensure that other MSIDs
                    # are fully processed.
                    continue
            data_len = append_h5_col(dats, colname, archfiles_overlaps)
            data_lens.add(data_len)
            processed_cols.add(colname)

        if len(data_lens) != 1:
            raise ValueError(
                'h5 data length inconsistency {}, investigate NOW!'.format(
                    data_lens))

        # Process any new MSIDs (this is extremely rare)
        data_len = data_lens.pop()
        for colname in colnames - processed_cols:
            ft['msid'] = colname
            append_filled_h5_col(dats, colname, data_len)

    # Assuming everything worked now commit the db inserts that signify the
    # new archive files have been processed
    if not opt.dry_run:
        db.commit()

    # If colnames or colnames_all changed then give warning and update files.
    if colnames != old_colnames:
        logger.warning('WARNING: updating %s because colnames changed: %s' %
                       (msid_files['colnames'].abs, old_colnames ^ colnames))
        if not opt.dry_run:
            pickle.dump(colnames, open(msid_files['colnames'].abs, 'wb'))
    if colnames_all != old_colnames_all:
        logger.warning(
            'WARNING: updating %s because colnames_all changed: %s' %
            (msid_files['colnames_all'].abs, colnames_all ^ old_colnames_all))
        if not opt.dry_run:
            pickle.dump(colnames_all, open(msid_files['colnames_all'].abs,
                                           'wb'))

    return archfiles_processed
Example #39
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                   # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
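    # Worked example (added for clarity, not in the original comments): with the
    # standard MNIST split of 50,000 training images and batch_size=600,
    # n_train_batches is 83, so validation_frequency = min(83, 2500) = 83 and the
    # model is validated once per epoch; with patience = 5000 iterations, roughly
    # 60 epochs run before early stopping can possibly trigger.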

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code ran for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)
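
The model pickled to best_model.pkl above can later be reloaded for inference. A minimal sketch, assuming the LogisticRegression instance keeps references to its symbolic input (classifier.input) and predicted labels (classifier.y_pred):

def predict(dataset='mnist.pkl.gz', n_examples=10):
    # Reload the trained classifier and predict labels for a few test images.
    # Assumes `input` and `y_pred` attributes exist on the pickled classifier.
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)

    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)

    test_set_x, test_set_y = load_data(dataset)[2]
    predicted_values = predict_model(test_set_x.get_value()[:n_examples])
    print('Predicted labels for the first %i examples:' % n_examples)
    print(predicted_values)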
Beispiel #40
0
def run(args):
    import libtbx.load_env

    from dials.util import Sorry

    usage = "dials.reindex [options] indexed.expt indexed.refl"

    parser = OptionParser(
        usage=usage,
        phil=phil_scope,
        read_reflections=True,
        read_experiments=True,
        check_format=False,
        epilog=help_message,
    )

    params, options = parser.parse_args(show_diff_phil=True)

    reflections, experiments = reflections_and_experiments_from_files(
        params.input.reflections, params.input.experiments)
    if len(experiments) == 0 and len(reflections) == 0:
        parser.print_help()
        return
    if params.change_of_basis_op is None:
        raise Sorry("Please provide a change_of_basis_op.")

    reference_crystal = None
    if params.reference.experiments is not None:
        from dxtbx.serialize import load

        reference_experiments = load.experiment_list(
            params.reference.experiments, check_format=False)
        assert len(reference_experiments.crystals()) == 1
        reference_crystal = reference_experiments.crystals()[0]

    if params.reference.reflections is not None:
        # First check that we have everything as expected for the reference reindexing
        # Currently only supports reindexing one dataset at a time
        if params.reference.experiments is None:
            raise Sorry(
                """For reindexing against a reference dataset, a reference
experiments file must also be specified with the option: reference= """)
        if not os.path.exists(params.reference.reflections):
            raise Sorry("Could not locate reference dataset reflection file")
        if len(experiments) != 1 or len(reflections) != 1:
            raise Sorry(
                "Only one dataset can be reindexed to a reference at a time")

        reference_reflections = flex.reflection_table().from_file(
            params.reference.reflections)

        test_reflections = reflections[0]

        if (reference_crystal.get_space_group().type().number() !=
                experiments.crystals()[0].get_space_group().type().number()):
            raise Sorry("Space group of input does not match reference")

        # Set some flags to allow filtering, if wanting to reindex against
        # reference with data that has not yet been through integration
        if (test_reflections.get_flags(
                test_reflections.flags.integrated_sum).count(True) == 0):
            assert (
                "intensity.sum.value"
                in test_reflections), "No 'intensity.sum.value' in reflections"
            test_reflections.set_flags(
                flex.bool(test_reflections.size(), True),
                test_reflections.flags.integrated_sum,
            )
        if (reference_reflections.get_flags(
                reference_reflections.flags.integrated_sum).count(True) == 0):
            assert ("intensity.sum.value" in test_reflections
                    ), "No 'intensity.sum.value in reference reflections"
            reference_reflections.set_flags(
                flex.bool(reference_reflections.size(), True),
                reference_reflections.flags.integrated_sum,
            )

        # Make miller array of the two datasets
        try:
            test_miller_set = filtered_arrays_from_experiments_reflections(
                experiments, [test_reflections])[0]
        except ValueError:
            raise Sorry(
                "No reflections remain after filtering the test dataset")
        try:
            reference_miller_set = filtered_arrays_from_experiments_reflections(
                reference_experiments, [reference_reflections])[0]
        except ValueError:
            raise Sorry(
                "No reflections remain after filtering the reference dataset")

        from dials.algorithms.symmetry.reindex_to_reference import (
            determine_reindex_operator_against_reference, )

        change_of_basis_op = determine_reindex_operator_against_reference(
            test_miller_set, reference_miller_set)

    elif len(experiments) and params.change_of_basis_op is libtbx.Auto:
        if reference_crystal is not None:
            if len(experiments.crystals()) > 1:
                raise Sorry("Only one crystal can be processed at a time")
            from dials.algorithms.indexing.compare_orientation_matrices import (
                difference_rotation_matrix_axis_angle, )

            cryst = experiments.crystals()[0]
            R, axis, angle, change_of_basis_op = difference_rotation_matrix_axis_angle(
                cryst, reference_crystal)
            print("Change of basis op: %s" % change_of_basis_op)
            print("Rotation matrix to transform input crystal to reference::")
            print(R.mathematica_form(format="%.3f", one_row_per_line=True))
            print(
                "Rotation of %.3f degrees" % angle,
                "about axis (%.3f, %.3f, %.3f)" % axis,
            )

        elif len(reflections):
            assert len(reflections) == 1

            # always re-map reflections to reciprocal space
            refl = reflections[0].deep_copy()
            refl.centroid_px_to_mm(experiments)
            refl.map_centroids_to_reciprocal_space(experiments)

            # index the reflection list using the input experiments list
            refl["id"] = flex.int(len(refl), -1)
            index = AssignIndicesGlobal(tolerance=0.2)
            index(refl, experiments)
            hkl_expt = refl["miller_index"]
            hkl_input = reflections[0]["miller_index"]

            change_of_basis_op = derive_change_of_basis_op(hkl_input, hkl_expt)

            # reset experiments list since we don't want to reindex this
            experiments = []

    else:
        change_of_basis_op = sgtbx.change_of_basis_op(
            params.change_of_basis_op)

    if len(experiments):
        space_group = params.space_group
        if space_group is not None:
            space_group = space_group.group()
        experiments = reindex_experiments(experiments,
                                          change_of_basis_op,
                                          space_group=space_group)
        print("Saving reindexed experimental models to %s" %
              params.output.experiments)
        experiments.as_file(params.output.experiments)

    if len(reflections):
        assert len(reflections) == 1
        reflections = reflections[0]

        miller_indices = reflections["miller_index"]

        if params.hkl_offset is not None:
            h, k, l = miller_indices.as_vec3_double().parts()
            h += params.hkl_offset[0]
            k += params.hkl_offset[1]
            l += params.hkl_offset[2]
            miller_indices = flex.miller_index(h.iround(), k.iround(),
                                               l.iround())
        non_integral_indices = change_of_basis_op.apply_results_in_non_integral_indices(
            miller_indices)
        if non_integral_indices.size() > 0:
            print(
                "Removing %i/%i reflections (change of basis results in non-integral indices)"
                % (non_integral_indices.size(), miller_indices.size()))
        sel = flex.bool(miller_indices.size(), True)
        sel.set_selected(non_integral_indices, False)
        miller_indices_reindexed = change_of_basis_op.apply(
            miller_indices.select(sel))
        reflections["miller_index"].set_selected(sel, miller_indices_reindexed)
        reflections["miller_index"].set_selected(~sel, (0, 0, 0))

        print("Saving reindexed reflections to %s" % params.output.reflections)
        with open(params.output.reflections, "wb") as fh:
            pickle.dump(reflections, fh, protocol=pickle.HIGHEST_PROTOCOL)
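
Because the reflections are written above with plain pickle, a minimal sketch of reading them back in a later session (a hypothetical helper, not part of dials itself) could be:

import pickle

def load_reindexed_reflections(path):
    # Load the reflection table pickled by run() above.
    with open(path, 'rb') as fh:
        return pickle.load(fh)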
Beispiel #41
0
def train(opt):
    # opt.use_att = utils.if_use_att(opt.caption_model)
    opt.use_att = True
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    opt.vocab_size = 50
    opt.seq_length = 10
    opt.fc_feat_size = 100
    opt.train_true = True
    opt.train_true_step = 100
    np.random.seed(0)
    data_num = 5000
    data_features = np.random.normal(size=[data_num, opt.fc_feat_size])
    test_data_num = 1000
    test_data_features = np.random.normal(
        size=[test_data_num, opt.fc_feat_size])
    print(opt.checkpoint_path)
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    critic_loss_history = histories.get('critic_loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    variance_history = histories.get('variance_history', {})
    time_history = histories.get('time_history', {})

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = model
    #TODO: save true model
    true_model = models.setup(opt).cuda()
    if vars(opt).get('start_from', None) is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            opt.start_from), " %s must be a a path" % opt.start_from
        assert os.path.isfile(
            os.path.join(opt.start_from, "infos_" + opt.id + ".pkl")
        ), "infos.pkl file does not exist in path %s" % opt.start_from
        true_model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'truemodel.pth')))
    true_model.eval()
    ######################### Actor-critic Training #####################################################################

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = utils.build_optimizer(model.parameters(), opt)
    tm_optimizer = utils.build_optimizer(true_model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    first_order = 0
    second_order = 0
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        dp_model.train()

        torch.cuda.synchronize()
        start = time.time()
        gen_result = None
        start_index = (iteration * opt.batch_size) % data_num
        end_index = start_index + opt.batch_size
        fc_feats = torch.from_numpy(
            data_features[start_index:end_index, :]).cuda().float()
        att_feats = None
        att_masks = None
        labels, total_logits = true_model(fc_feats,
                                          att_feats,
                                          att_masks,
                                          opt={'sample_max': 1},
                                          total_probs=True,
                                          mode='sample')
        labels = torch.cat(
            [torch.zeros(labels.size(0), 1).cuda().long(), labels], 1)
        masks = (labels > 0).float()

        # train true model:
        if iteration < opt.train_true_step and opt.train_true:
            tm_optimizer.zero_grad()
            loss = -((total_logits * F.softmax(total_logits, 2)).sum(2)).mean()
            loss.backward()
            tm_optimizer.step()

        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                        labels[:, 1:], masks[:, 1:])
        else:
            if opt.rl_type == 'sc':
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                gen_result_sc, _ = dp_model(fc_feats,
                                            att_feats,
                                            att_masks,
                                            opt={'sample_max': 1},
                                            mode='sample')
                reward = reward_fun(gen_result, fc_feats,
                                    true_model).unsqueeze(1).repeat(
                                        1, sample_logprobs.size(1))
                reward_sc = reward_fun(gen_result_sc, fc_feats,
                                       true_model).unsqueeze(1).repeat(
                                           1, sample_logprobs.size(1))
                reward = reward - reward_sc
                loss = rl_crit(sample_logprobs, gen_result.data, reward)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'reinforce':
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = reward_fun(gen_result, fc_feats,
                                    true_model).unsqueeze(1).repeat(
                                        1, sample_logprobs.size(1))
                loss = rl_crit(sample_logprobs, gen_result.data, reward)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'reinforce_demean':
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = reward_fun(gen_result, fc_feats,
                                    true_model).unsqueeze(1).repeat(
                                        1, sample_logprobs.size(1))
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward - reward.mean())
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'arsm':
                loss = get_arm_loss(dp_model, fc_feats, att_feats, att_masks,
                                    true_model, opt)
                #print(loss)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'ars':
                loss = get_arm_loss(dp_model,
                                    fc_feats,
                                    att_feats,
                                    att_masks,
                                    true_model,
                                    opt,
                                    type='ars')
                #print(loss)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'ar':
                loss = get_ar_loss(dp_model, fc_feats, att_feats, att_masks,
                                   true_model, opt)
                # print(loss)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'mct_baseline':
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, mct_baseline = get_mct_loss(
                    dp_model, fc_feats, att_feats, att_masks, opt, true_model)
                reward = reward_fun(gen_result, fc_feats,
                                    true_model).unsqueeze(1).repeat(
                                        1, sample_logprobs.size(1))
                reward_cuda = reward
                #mct_baseline[mct_baseline < 0] = reward_cuda[mct_baseline < 0]
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward - mct_baseline)
        if opt.mle_weights != 0:
            loss += opt.mle_weights * crit(
                dp_model(fc_feats, att_feats, labels, att_masks),
                labels[:, 1:], masks[:, 1:])
        #TODO make sure all sampling replaced by greedy for critic
        #### update the actor
        loss.backward()
        # with open(os.path.join(opt.checkpoint_path, 'best_embed.pkl'), 'wb') as f:
        #     cPickle.dump(list(dp_model.embed.parameters())[0].data.cpu().numpy(), f)
        # with open(os.path.join(opt.checkpoint_path, 'best_logit.pkl'), 'wb') as f:
        #     cPickle.dump(list(dp_model.logit.parameters())[0].data.cpu().numpy(), f)
        ## compute variance
        gradient = torch.zeros([0]).cuda()
        for i in model.parameters():
            gradient = torch.cat((gradient, i.grad.view(-1)), 0)
        first_order = 0.9999 * first_order + 0.0001 * gradient
        second_order = 0.9999 * second_order + 0.0001 * gradient.pow(2)
        # print(torch.max(torch.abs(gradient)))
        variance = torch.mean(torch.abs(second_order -
                                        first_order.pow(2))).item()
        if opt.rl_type != 'arsm' or not sc_flag:
            utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if (iteration % opt.losses_log_every == 0):
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
                print(opt.checkpoint_path)
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, variance = {:g}, time/batch = {:.3f}" \
                      .format(iteration, epoch, reward.mean(), variance, end - start))

        # Update the iteration and epoch
        iteration += 1
        if (iteration * opt.batch_size) % data_num == 0:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  reward.mean(), iteration)
                add_summary_value(tb_summary_writer, 'variance', variance,
                                  iteration)

            #loss_history[iteration] = train_loss if not sc_flag else reward.mean()
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
            variance_history[iteration] = variance
            time_history[iteration] = end - start

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model

            val_loss, lang_stats = eval_utils_syn(dp_model, true_model,
                                                  test_data_features,
                                                  opt.batch_size, crit)

            lang_stats = lang_stats.item()
            val_loss = val_loss.item()
            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats
            }
            # Save model if is improving on validation result
            print('loss', val_loss, 'lang_stats', lang_stats)
            if True:  # if true
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                if not os.path.isdir(opt.checkpoint_path):
                    os.mkdir(opt.checkpoint_path)
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'truemodel.pth')
                torch.save(true_model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscellaneous information
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = opt.vocab_size
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['critic_loss_history'] = critic_loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                histories['variance_history'] = variance_history
                histories['time_history'] = time_history
                # histories['variance'] = 0
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Beispiel #42
0
def pickle_environment(path, environment=None):
    """Pickle an environment dictionary to a file."""
    cPickle.dump(dict(environment if environment else os.environ),
                 open(path, 'wb'),
                 protocol=2)
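
A matching reader is not shown; a minimal sketch (hypothetical, using the standard pickle module) might be:

import pickle

def unpickle_environment(path):
    # Load an environment dictionary written by pickle_environment() above.
    with open(path, 'rb') as f:
        return pickle.load(f)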
Beispiel #43
0
    def read_doc(self, docname, app=None):
        """Parse a file and add/update inventory entries for the doctree."""

        self.temp_data['docname'] = docname
        # defaults to the global default, but can be re-set in a document
        self.temp_data['default_domain'] = \
            self.domains.get(self.config.primary_domain)

        self.settings['input_encoding'] = self.config.source_encoding
        self.settings['trim_footnote_reference_space'] = \
            self.config.trim_footnote_reference_space
        self.settings['gettext_compact'] = self.config.gettext_compact

        docutilsconf = path.join(self.srcdir, 'docutils.conf')
        # read docutils.conf from source dir, not from current dir
        OptionParser.standard_config_files[1] = docutilsconf
        if path.isfile(docutilsconf):
            self.note_dependency(docutilsconf)

        with sphinx_domains(self):
            if self.config.default_role:
                role_fn, messages = roles.role(self.config.default_role, english,
                                               0, dummy_reporter)
                if role_fn:
                    roles._roles[''] = role_fn
                else:
                    self.warn(docname, 'default role %s not found' %
                              self.config.default_role)

            codecs.register_error('sphinx', self.warn_and_replace)

            # publish manually
            reader = SphinxStandaloneReader(self.app, parsers=self.config.source_parsers)
            pub = Publisher(reader=reader,
                            writer=SphinxDummyWriter(),
                            destination_class=NullOutput)
            pub.set_components(None, 'restructuredtext', None)
            pub.process_programmatic_settings(None, self.settings, None)
            src_path = self.doc2path(docname)
            source = SphinxFileInput(app, self, source=None, source_path=src_path,
                                     encoding=self.config.source_encoding)
            pub.source = source
            pub.settings._source = src_path
            pub.set_destination(None, None)
            pub.publish()
            doctree = pub.document

        # post-processing
        self.process_dependencies(docname, doctree)
        self.process_images(docname, doctree)
        self.process_downloads(docname, doctree)
        self.process_metadata(docname, doctree)
        self.create_title_from(docname, doctree)
        for manager in itervalues(self.managers):
            manager.process_doc(docname, doctree)
        for domain in itervalues(self.domains):
            domain.process_doc(self, docname, doctree)

        # allow extension-specific post-processing
        if app:
            app.emit('doctree-read', doctree)

        # store time of reading, for outdated files detection
        # (Some filesystems have coarse timestamp resolution;
        # therefore time.time() can be older than filesystem's timestamp.
        # For example, FAT32 has 2sec timestamp resolution.)
        self.all_docs[docname] = max(
            time.time(), path.getmtime(self.doc2path(docname)))

        if self.versioning_condition:
            old_doctree = None
            if self.versioning_compare:
                # get old doctree
                try:
                    with open(self.doc2path(docname,
                                            self.doctreedir, '.doctree'), 'rb') as f:
                        old_doctree = pickle.load(f)
                except EnvironmentError:
                    pass

            # add uids for versioning
            if not self.versioning_compare or old_doctree is None:
                list(add_uids(doctree, self.versioning_condition))
            else:
                list(merge_doctrees(
                    old_doctree, doctree, self.versioning_condition))

        # make it picklable
        doctree.reporter = None
        doctree.transformer = None
        doctree.settings.warning_stream = None
        doctree.settings.env = None
        doctree.settings.record_dependencies = None

        # cleanup
        self.temp_data.clear()
        self.ref_context.clear()
        roles._roles.pop('', None)  # if a document has set a local default role

        # save the parsed doctree
        doctree_filename = self.doc2path(docname, self.doctreedir,
                                         '.doctree')
        ensuredir(path.dirname(doctree_filename))
        with open(doctree_filename, 'wb') as f:
            pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL)
Beispiel #44
0
 def save(self, file_name):
     f = open(file_name, 'wb')
     pickle.dump(self.weights, f, pickle.HIGHEST_PROTOCOL)
     f.close()
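
A matching loader for the weights (an assumption; the rest of the original class is not shown) could look like:

 def load(self, file_name):
     # Restore weights pickled by save() above.
     with open(file_name, 'rb') as f:
         self.weights = pickle.load(f)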
Beispiel #45
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)

    validate_flags_or_throw(albert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = fine_tuning_utils.create_vocab(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file,
        hub_module=FLAGS.albert_hub_module_handle)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        keep_checkpoint_max=0,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    train_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    if FLAGS.do_train:
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = squad_utils.v2_model_fn_builder(
        albert_config=albert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        max_seq_length=FLAGS.max_seq_length,
        start_n_top=FLAGS.start_n_top,
        end_n_top=FLAGS.end_n_top,
        dropout_prob=FLAGS.dropout_prob,
        hub_module=FLAGS.albert_hub_module_handle)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant tensors
        # in memory.

        if not tf.gfile.Exists(FLAGS.train_feature_file):
            train_writer = squad_utils.FeatureWriter(filename=os.path.join(
                FLAGS.train_feature_file),
                                                     is_training=True)
            squad_utils.convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=True,
                output_fn=train_writer.process_feature,
                do_lower_case=FLAGS.do_lower_case)
            train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", len(train_examples))
        # tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        del train_examples

        train_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.train_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size,
            is_v2=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        with tf.gfile.Open(FLAGS.predict_file) as predict_file:
            prediction_json = json.load(predict_file)["data"]
        eval_examples = squad_utils.read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)

        if (tf.gfile.Exists(FLAGS.predict_feature_file)
                and tf.gfile.Exists(FLAGS.predict_feature_left_file)):
            tf.logging.info("Loading eval features from {}".format(
                FLAGS.predict_feature_left_file))
            with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
                eval_features = pickle.load(fin)
        else:
            eval_writer = squad_utils.FeatureWriter(
                filename=FLAGS.predict_feature_file, is_training=False)
            eval_features = []

            def append_feature(feature):
                eval_features.append(feature)
                eval_writer.process_feature(feature)

            squad_utils.convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=False,
                output_fn=append_feature,
                do_lower_case=FLAGS.do_lower_case)
            eval_writer.close()

            with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout:
                pickle.dump(eval_features, fout)

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.predict_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size,
            is_v2=True)

        def get_result(checkpoint):
            """Evaluate the checkpoint on SQuAD v2.0."""
            # If running eval on the TPU, you will need to specify the number of
            # steps.
            reader = tf.train.NewCheckpointReader(checkpoint)
            global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
            all_results = []
            for result in estimator.predict(predict_input_fn,
                                            yield_single_examples=True,
                                            checkpoint_path=checkpoint):
                if len(all_results) % 1000 == 0:
                    tf.logging.info("Processing example: %d" %
                                    (len(all_results)))
                unique_id = int(result["unique_ids"])
                start_top_log_probs = ([
                    float(x) for x in result["start_top_log_probs"].flat
                ])
                start_top_index = [
                    int(x) for x in result["start_top_index"].flat
                ]
                end_top_log_probs = ([
                    float(x) for x in result["end_top_log_probs"].flat
                ])
                end_top_index = [int(x) for x in result["end_top_index"].flat]

                cls_logits = float(result["cls_logits"].flat[0])
                all_results.append(
                    squad_utils.RawResultV2(
                        unique_id=unique_id,
                        start_top_log_probs=start_top_log_probs,
                        start_top_index=start_top_index,
                        end_top_log_probs=end_top_log_probs,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits))

            output_prediction_file = os.path.join(FLAGS.output_dir,
                                                  "predictions.json")
            output_nbest_file = os.path.join(FLAGS.output_dir,
                                             "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                     "null_odds.json")

            result_dict = {}
            cls_dict = {}
            squad_utils.accumulate_predictions_v2(
                result_dict, cls_dict, eval_examples, eval_features,
                all_results, FLAGS.n_best_size, FLAGS.max_answer_length,
                FLAGS.start_n_top, FLAGS.end_n_top)

            return squad_utils.evaluate_v2(
                result_dict, cls_dict, prediction_json, eval_examples,
                eval_features, all_results, FLAGS.n_best_size,
                FLAGS.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file), int(global_step)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "f1"
        writer = tf.gfile.GFile(output_eval_file, "w")
        if tf.gfile.Exists(checkpoint_path + ".index"):
            result = get_result(checkpoint_path)
            best_perf = result[0][key_name]
            global_step = result[1]
        else:
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        while global_step < num_train_steps:
            steps_and_files = {}
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                    if cur_filename.split("-")[-1] == "best":
                        continue
                    gstep = int(cur_filename.split("-")[-1])
                    if gstep not in steps_and_files:
                        tf.logging.info(
                            "Add {} to eval list.".format(cur_filename))
                        steps_and_files[gstep] = cur_filename
            tf.logging.info("found {} files.".format(len(steps_and_files)))
            if not steps_and_files:
                tf.logging.info(
                    "found 0 file, global step: {}. Sleeping.".format(
                        global_step))
                time.sleep(60)
            else:
                for ele in sorted(steps_and_files.items()):
                    step, checkpoint_path = ele
                    if global_step >= step:
                        if len(_find_valid_cands(step)) > 1:
                            for ext in [
                                    "meta", "data-00000-of-00001", "index"
                            ]:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tf.logging.info("removing {}".format(src_ckpt))
                                tf.gfile.Remove(src_ckpt)
                        continue
                    result, global_step = get_result(checkpoint_path)
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    if result[key_name] > best_perf:
                        best_perf = result[key_name]
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tgt_ckpt = checkpoint_path.rsplit(
                                "-", 1)[0] + "-best.{}".format(ext)
                            tf.logging.info("saving {} to {}".format(
                                src_ckpt, tgt_ckpt))
                            tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
                            writer.write("saved {} to {}\n".format(
                                src_ckpt, tgt_ckpt))
                    writer.write("best {} = {}\n".format(key_name, best_perf))
                    tf.logging.info("  best {} = {}\n".format(
                        key_name, best_perf))

                    if len(_find_valid_cands(global_step)) > 2:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            tf.gfile.Remove(src_ckpt)
                    writer.write("=" * 50 + "\n")

        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result, global_step = get_result(checkpoint_path)
        tf.logging.info("***** Final Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write("best perf happened at step: {}".format(global_step))
Beispiel #46
0
def train(opt):
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.old_id+'.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.old_id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.old_id+'.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    loader.syn_iterator_all()
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt)
    model.cuda()

    if opt.gpu_num > 1 :
        model_ = torch.nn.DataParallel(model, device_ids=range(opt.gpu_num))
    else :
        model_ = model
    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay)

    optimizer.zero_grad()
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        # make evaluation on validation set, and save model
        if (update_lr_flag):
            # eval model
            eval_kwargs = {'split': 'val',
                            'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils_t.eval_split(None, model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss', val_loss, iteration)
                for k,v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = - val_loss

            best_flag = False
            if True: # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscellaneous information
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)

        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate  ** frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_cider_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks = tmp
        if opt.use_topic:
            topics = Variable(torch.from_numpy(data['topics']), requires_grad=False).cuda()
            if not sc_flag:
                loss = crit(model_(fc_feats, att_feats, topics, labels), labels[:,1:], masks[:,1:])
            else:
                gen_result, sample_logprobs = model.sample(fc_feats, att_feats, topics, {'sample_max':0})
                reward, base_cider, explore_cider = get_self_critical_reward_t(model, fc_feats, att_feats, topics, data, gen_result)
                loss = rl_crit(sample_logprobs, gen_result, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False))
        else:
            if not sc_flag:
                loss = crit(model_(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:])
            else:
                gen_result, sample_logprobs = model.sample(fc_feats, att_feats, {'sample_max':0})
                reward, base_cider, explore_cider = get_self_critical_reward(model, fc_feats, att_feats, data, gen_result)
                loss = rl_crit(sample_logprobs, gen_result, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False))
        loss_ = loss / opt.iter_times
        loss_.backward()
        if (iteration + 1) % opt.iter_times == 0:
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            optimizer.zero_grad()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        if iteration % 25 == 0 :
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, base_cider = {:.3f}, explore_cider = {:.3f},  time/batch = {:.3f}" \
                    .format(iteration, epoch, np.mean(reward[:,0]), base_cider, explore_cider, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
            loader.reset_iterator('train')

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss, iteration)
                add_summary_value(tf_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tf_summary_writer, 'avg_reward', np.mean(reward[:,0]), iteration)
                tf_summary_writer.flush()

            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:,0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
Beispiel #47
0
##########################################

pickle_file = 'notMNIST.pickle'

try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise
  
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)
###############################################
#prune data, delete repeated samples in training, test and validation sets
models = np.unique(train_labels)
idx_tr    = np.arange(train_labels.size); c_tr = np.zeros(train_labels.size);
idx_va    = np.arange(valid_labels.size); c_va = np.zeros(valid_labels.size);
idx_te    = np.arange(test_labels.size);  c_te = np.zeros(test_labels.size);

for mm in models:   
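
# Hedged sketch, not part of the original snippet (the per-class loop above is truncated):
# one way repeated samples could be detected is by hashing each image's raw bytes; the
# helper name `unique_sample_indices` is an assumption for illustration only.
def unique_sample_indices(dataset):
    # return the indices of the first occurrence of every distinct sample
    seen = set()
    keep = []
    for i, sample in enumerate(dataset):
        key = sample.tobytes()
        if key not in seen:
            seen.add(key)
            keep.append(i)
    return keep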
Beispiel #48
0
        print "Rank: {}, Hidden Layer Size: {}".format(
            compute_effective_rank(svals), svals.shape[0])

    if isinstance(batch_size, int):
        if batch_size == 1:
            fig_outfile = 'perf_mlp_seque.png'
        else:
            fig_outfile = 'perf_mlp_minibatchsize_%d.png' % batch_size
    else:
        fig_outfile = 'perf_mlp_batch.png'

    if raw_input("Shall we save this model? (y/n)\n") == 'y':
        model_outfile = fig_outfile.split('.')[0] + ".pkl"
        fobj = open(model_outfile, 'wb')
        cPickle.dump(predict, fobj, protocol=cPickle.HIGHEST_PROTOCOL)
        fobj.close()

    if raw_input('Save training figure? (y/n): \n') == 'y':
        performanceplot(cost_record, tr_err_record, te_err_record,
                        "contrast_" + fig_outfile)

else:
    model_outfile = raw_input("Provide path to model_outfile: \n")
    fobj = open(model_outfile, 'rb')
    predict = cPickle.load(fobj)
    fobj.close()

if raw_input("Perform failure analysis? (y/n):\n") == 'y':
    failure_analysis.investigate_mlp(teX, teY, predict(teX) > 0.5)
Beispiel #49
0
# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

#save the words and vocabulary
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)
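
# Hedged sketch, not from the original: the tuple pickled above can be read back the same
# way in a separate sampling script; `vocab_file` is the same path used for the dump.
with open(vocab_file, 'rb') as f:
    words, vocab, vocabulary_inv = cPickle.load(f)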

#create sequences
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i:i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
Beispiel #50
0
def train(opt):
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = torch.nn.DataParallel(model)

    epoch_done = True
    # Assure in training mode
    dp_model.train()

    if opt.label_smoothing > 0:
        crit = utils.LabelSmoothing(smoothing=opt.label_smoothing)
    else:
        crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if epoch_done:
            if not opt.noamopt and not opt.reduce_on_plateau:
                # Assign the learning rate
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay_rate  ** frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            epoch_done = False
                
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        
        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:])
        else:
            gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats, att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data, torch.from_numpy(reward).float().cuda())

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            epoch_done = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            if opt.noamopt:
                opt.current_lr = optimizer.rate()
            elif opt.reduce_on_plateau:
                opt.current_lr = optimizer.current_lr
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward[:,0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:,0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val',
                            'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(dp_model, crit, loader, eval_kwargs)

            if opt.reduce_on_plateau:
                if 'CIDEr' in lang_stats:
                    optimizer.scheduler_step(-lang_stats['CIDEr'])
                else:
                    optimizer.scheduler_step(val_loss)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            for k,v in lang_stats.items():
                add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = - val_loss

            best_flag = False
            if True: # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscellaneous information
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Beispiel #51
0
for epoch in range(args.epoch):
    accum_loss = 0
    print('epoch: {0}'.format(epoch))
    indexes = np.random.permutation(skip)
    for i in indexes:
        if word_count >= next_count:
            now = time.time()
            duration = now - cur_at
            throughput = 100000. / (now - cur_at)
            print('{} words, {:.2f} sec, {:.2f} words/sec'.format(
                word_count, duration, throughput))
            next_count += 100000
            cur_at = now

        position = np.array(range(0, args.batchsize)) * skip + (args.window +
                                                                i)
        loss = train_model(dataset, position)
        accum_loss += loss.data
        word_count += args.batchsize

        optimizer.zero_grads()
        loss.backward()
        optimizer.update()

    print(accum_loss)

model.to_cpu()
with open('model.pickle', 'wb') as f:
    obj = (model, index2word, word2index)
    pickle.dump(obj, f)
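
# Hedged sketch, not part of the original script: the pickled tuple can be restored later
# to reuse the trained model and its vocabulary mappings.
with open('model.pickle', 'rb') as f:
    model, index2word, word2index = pickle.load(f)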
Beispiel #52
0
    def log_checkpoint(self,
                       epoch,
                       val_loss,
                       metrics,
                       predictions,
                       opt,
                       model,
                       dataset,
                       optimizer=None):
        # Write validation result into summary
        if self.tensorboard.tf is not None:
            self.tensorboard.add_summary_value('validation loss', val_loss,
                                               self.iteration)
            for k, v in metrics.items():
                self.tensorboard.add_summary_value(k, v, self.iteration)
                self.tensorboard.writer.flush()
        self.val_result_history[self.iteration] = {
            'loss': val_loss,
            'metrics': metrics,
            'predictions': predictions
        }

        # Save model if the validation result is improved
        if opt.metric == 'XE':
            current_score = -val_loss
        else:
            current_score = metrics[opt.metric]

        best_flag = False
        if self.best_val_score is None or current_score > self.best_val_score:
            self.best_val_score = current_score
            best_flag = True

        # save the model at current iteration
        checkpoint_path = os.path.join(
            self.log_dir, 'model_iter_{}.pth'.format(self.iteration))
        torch.save(model.state_dict(), checkpoint_path)
        # save as latest model
        checkpoint_path = os.path.join(self.log_dir, 'model.pth')
        torch.save(model.state_dict(), checkpoint_path)
        logging.info("model saved to {}".format(checkpoint_path))
        # save optimizer
        if optimizer is not None:
            optimizer_path = os.path.join(self.log_dir, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

        # Dump miscellaneous information
        self.infos['iter'] = self.iteration
        self.infos['epoch'] = epoch
        self.infos['best_val_score'] = self.best_val_score
        self.infos['opt'] = opt
        self.infos['vocab'] = dataset.get_vocab()

        self.histories['val_result_history'] = self.val_result_history
        self.histories['loss_history'] = self.loss_history
        self.histories['lr_history'] = self.lr_history
        self.histories['ss_prob_history'] = self.ss_prob_history
        with open(os.path.join(self.log_dir, 'infos.pkl'), 'wb') as f:
            cPickle.dump(self.infos, f)
        with open(os.path.join(self.log_dir, 'histories.pkl'), 'wb') as f:
            cPickle.dump(self.histories, f)

        if best_flag:
            checkpoint_path = os.path.join(self.log_dir, 'model-best.pth')
            torch.save(model.state_dict(), checkpoint_path)
            logging.info("model saved to {}".format(checkpoint_path))
            with open(os.path.join(self.log_dir, 'infos-best.pkl'), 'wb') as f:
                cPickle.dump(self.infos, f)
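
    # Hedged companion sketch, not part of the original logger class: a matching load step
    # could restore the bookkeeping written by log_checkpoint() when a run is resumed; the
    # method name `load_checkpoint_infos` is an assumption.
    def load_checkpoint_infos(self):
        with open(os.path.join(self.log_dir, 'infos.pkl'), 'rb') as f:
            self.infos = cPickle.load(f)
        self.iteration = self.infos.get('iter', 0)
        self.best_val_score = self.infos.get('best_val_score', None)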
Beispiel #53
0
def setup_servers(the_object_server=object_server, extra_conf=None):
    """
    Setup proxy, account, container and object servers using a set of fake
    rings and policies.

    :param the_object_server: The object server module to use (optional,
                              defaults to swift.obj.server)
    :param extra_conf: A dict of config options that will update the basic
                       config passed to all server instances.
    :returns: A dict containing the following entries:
                  orig_POLICIES: the value of storage_policy.POLICIES prior to
                                 it being patched with fake policies
                  orig_SysLogHandler: the value of utils.SysLogHandler prior to
                                      it being patched
                  testdir: root directory used for test files
                  test_POLICIES: a StoragePolicyCollection of fake policies
                  test_servers: a tuple of test server instances
                  test_sockets: a tuple of sockets used by test servers
                  test_coros: a tuple of greenthreads in which test servers are
                              running
    """
    context = {
        "orig_POLICIES": storage_policy._POLICIES,
        "orig_SysLogHandler": utils.SysLogHandler
    }

    utils.HASH_PATH_SUFFIX = b'endcap'
    utils.SysLogHandler = mock.MagicMock()
    # Since we're starting up a lot here, we're going to test more than
    # just chunked puts; we're also going to test parts of
    # proxy_server.Application we couldn't get to easily otherwise.
    context["testdir"] = _testdir = \
        os.path.join(mkdtemp(), 'tmp_test_proxy_server_chunked')
    mkdirs(_testdir)
    rmtree(_testdir)
    for drive in ('sda1', 'sdb1', 'sdc1', 'sdd1', 'sde1', 'sdf1', 'sdg1',
                  'sdh1', 'sdi1', 'sdj1', 'sdk1', 'sdl1'):
        mkdirs(os.path.join(_testdir, drive, 'tmp'))
    conf = {
        'devices': _testdir,
        'swift_dir': _testdir,
        'mount_check': 'false',
        'allowed_headers':
        'content-encoding, x-object-manifest, content-disposition, foo',
        'allow_versions': 't',
        'node_timeout': 20
    }
    if extra_conf:
        conf.update(extra_conf)
    prolis = listen_zero()
    acc1lis = listen_zero()
    acc2lis = listen_zero()
    con1lis = listen_zero()
    con2lis = listen_zero()
    obj1lis = listen_zero()
    obj2lis = listen_zero()
    obj3lis = listen_zero()
    obj4lis = listen_zero()
    obj5lis = listen_zero()
    obj6lis = listen_zero()
    objsocks = [obj1lis, obj2lis, obj3lis, obj4lis, obj5lis, obj6lis]
    context["test_sockets"] = \
        (prolis, acc1lis, acc2lis, con1lis, con2lis, obj1lis, obj2lis, obj3lis,
         obj4lis, obj5lis, obj6lis)
    account_ring_path = os.path.join(_testdir, 'account.ring.gz')
    account_devs = [
        {
            'port': acc1lis.getsockname()[1]
        },
        {
            'port': acc2lis.getsockname()[1]
        },
    ]
    write_fake_ring(account_ring_path, *account_devs)
    container_ring_path = os.path.join(_testdir, 'container.ring.gz')
    container_devs = [
        {
            'port': con1lis.getsockname()[1]
        },
        {
            'port': con2lis.getsockname()[1]
        },
    ]
    write_fake_ring(container_ring_path, *container_devs)
    storage_policy._POLICIES = storage_policy.StoragePolicyCollection([
        StoragePolicy(0, 'zero', True),
        StoragePolicy(1, 'one', False),
        StoragePolicy(2, 'two', False),
        ECStoragePolicy(3,
                        'ec',
                        ec_type=DEFAULT_TEST_EC_TYPE,
                        ec_ndata=2,
                        ec_nparity=1,
                        ec_segment_size=4096),
        ECStoragePolicy(4,
                        'ec-dup',
                        ec_type=DEFAULT_TEST_EC_TYPE,
                        ec_ndata=2,
                        ec_nparity=1,
                        ec_segment_size=4096,
                        ec_duplication_factor=2)
    ])
    obj_rings = {
        0: ('sda1', 'sdb1'),
        1: ('sdc1', 'sdd1'),
        2: ('sde1', 'sdf1'),
        # sdg1, sdh1, sdi1 taken by policy 3 (see below)
    }
    for policy_index, devices in obj_rings.items():
        policy = storage_policy.POLICIES[policy_index]
        obj_ring_path = os.path.join(_testdir, policy.ring_name + '.ring.gz')
        obj_devs = [{
            'port': objsock.getsockname()[1],
            'device': dev
        } for objsock, dev in zip(objsocks, devices)]
        write_fake_ring(obj_ring_path, *obj_devs)

    # write_fake_ring can't handle a 3-element ring, and the EC policy needs
    # at least 6 devs to work with (ec_k=2, ec_m=1, duplication_factor=2),
    # so we do it manually
    devs = [{
        'id': 0,
        'zone': 0,
        'device': 'sdg1',
        'ip': '127.0.0.1',
        'port': obj1lis.getsockname()[1]
    }, {
        'id': 1,
        'zone': 0,
        'device': 'sdh1',
        'ip': '127.0.0.1',
        'port': obj2lis.getsockname()[1]
    }, {
        'id': 2,
        'zone': 0,
        'device': 'sdi1',
        'ip': '127.0.0.1',
        'port': obj3lis.getsockname()[1]
    }, {
        'id': 3,
        'zone': 0,
        'device': 'sdj1',
        'ip': '127.0.0.1',
        'port': obj4lis.getsockname()[1]
    }, {
        'id': 4,
        'zone': 0,
        'device': 'sdk1',
        'ip': '127.0.0.1',
        'port': obj5lis.getsockname()[1]
    }, {
        'id': 5,
        'zone': 0,
        'device': 'sdl1',
        'ip': '127.0.0.1',
        'port': obj6lis.getsockname()[1]
    }]
    pol3_replica2part2dev_id = [[0, 1, 2, 0], [1, 2, 0, 1], [2, 0, 1, 2]]
    pol4_replica2part2dev_id = [[0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5],
                                [3, 4, 5, 0], [4, 5, 0, 1], [5, 0, 1, 2]]
    obj3_ring_path = os.path.join(
        _testdir, storage_policy.POLICIES[3].ring_name + '.ring.gz')
    part_shift = 30
    with closing(GzipFile(obj3_ring_path, 'wb')) as fh:
        pickle.dump(RingData(pol3_replica2part2dev_id, devs, part_shift), fh)

    obj4_ring_path = os.path.join(
        _testdir, storage_policy.POLICIES[4].ring_name + '.ring.gz')
    part_shift = 30
    with closing(GzipFile(obj4_ring_path, 'wb')) as fh:
        pickle.dump(RingData(pol4_replica2part2dev_id, devs, part_shift), fh)

    prosrv = proxy_server.Application(conf, logger=debug_logger('proxy'))
    for policy in storage_policy.POLICIES:
        # make sure all the rings are loaded
        prosrv.get_object_ring(policy.idx)
    # don't lose this one!
    context["test_POLICIES"] = storage_policy._POLICIES
    acc1srv = account_server.AccountController(conf,
                                               logger=debug_logger('acct1'))
    acc2srv = account_server.AccountController(conf,
                                               logger=debug_logger('acct2'))
    con1srv = container_server.ContainerController(
        conf, logger=debug_logger('cont1'))
    con2srv = container_server.ContainerController(
        conf, logger=debug_logger('cont2'))
    obj1srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj1'))
    obj2srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj2'))
    obj3srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj3'))
    obj4srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj4'))
    obj5srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj5'))
    obj6srv = the_object_server.ObjectController(conf,
                                                 logger=debug_logger('obj6'))
    context["test_servers"] = \
        (prosrv, acc1srv, acc2srv, con1srv, con2srv, obj1srv, obj2srv, obj3srv,
         obj4srv, obj5srv, obj6srv)
    nl = NullLogger()
    logging_prosv = proxy_logging.ProxyLoggingMiddleware(
        listing_formats.ListingFilter(prosrv, {}, logger=prosrv.logger),
        conf,
        logger=prosrv.logger)
    prospa = spawn(wsgi.server,
                   prolis,
                   logging_prosv,
                   nl,
                   protocol=SwiftHttpProtocol,
                   capitalize_response_headers=False)
    acc1spa = spawn(wsgi.server,
                    acc1lis,
                    acc1srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    acc2spa = spawn(wsgi.server,
                    acc2lis,
                    acc2srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    con1spa = spawn(wsgi.server,
                    con1lis,
                    con1srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    con2spa = spawn(wsgi.server,
                    con2lis,
                    con2srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj1spa = spawn(wsgi.server,
                    obj1lis,
                    obj1srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj2spa = spawn(wsgi.server,
                    obj2lis,
                    obj2srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj3spa = spawn(wsgi.server,
                    obj3lis,
                    obj3srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj4spa = spawn(wsgi.server,
                    obj4lis,
                    obj4srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj5spa = spawn(wsgi.server,
                    obj5lis,
                    obj5srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    obj6spa = spawn(wsgi.server,
                    obj6lis,
                    obj6srv,
                    nl,
                    protocol=SwiftHttpProtocol,
                    capitalize_response_headers=False)
    context["test_coros"] = \
        (prospa, acc1spa, acc2spa, con1spa, con2spa, obj1spa, obj2spa, obj3spa,
         obj4spa, obj5spa, obj6spa)
    # Create account
    ts = normalize_timestamp(time.time())
    partition, nodes = prosrv.account_ring.get_nodes('a')
    for node in nodes:
        conn = swift.proxy.controllers.obj.http_connect(
            node['ip'], node['port'], node['device'], partition, 'PUT', '/a', {
                'X-Timestamp': ts,
                'x-trans-id': 'test'
            })
        resp = conn.getresponse()
        assert (resp.status == 201)
    # Create another account
    # used for account-to-account tests
    ts = normalize_timestamp(time.time())
    partition, nodes = prosrv.account_ring.get_nodes('a1')
    for node in nodes:
        conn = swift.proxy.controllers.obj.http_connect(
            node['ip'], node['port'], node['device'], partition, 'PUT', '/a1',
            {
                'X-Timestamp': ts,
                'x-trans-id': 'test'
            })
        resp = conn.getresponse()
        assert (resp.status == 201)
    # Create containers, 1 per test policy
    sock = connect_tcp(('localhost', prolis.getsockname()[1]))
    fd = sock.makefile('rwb')
    fd.write(b'PUT /v1/a/c HTTP/1.1\r\nHost: localhost\r\n'
             b'Connection: close\r\nX-Auth-Token: t\r\n'
             b'Content-Length: 0\r\n\r\n')
    fd.flush()
    headers = readuntil2crlfs(fd)
    exp = b'HTTP/1.1 201'
    assert headers[:len(exp)] == exp, "Expected '%s', encountered '%s'" % (
        exp, headers[:len(exp)])
    # Create container in other account
    # used for account-to-account tests
    sock = connect_tcp(('localhost', prolis.getsockname()[1]))
    fd = sock.makefile('rwb')
    fd.write(b'PUT /v1/a1/c1 HTTP/1.1\r\nHost: localhost\r\n'
             b'Connection: close\r\nX-Auth-Token: t\r\n'
             b'Content-Length: 0\r\n\r\n')
    fd.flush()
    headers = readuntil2crlfs(fd)
    exp = b'HTTP/1.1 201'
    assert headers[:len(exp)] == exp, "Expected '%s', encountered '%s'" % (
        exp, headers[:len(exp)])

    sock = connect_tcp(('localhost', prolis.getsockname()[1]))
    fd = sock.makefile('rwb')
    fd.write(
        b'PUT /v1/a/c1 HTTP/1.1\r\nHost: localhost\r\n'
        b'Connection: close\r\nX-Auth-Token: t\r\nX-Storage-Policy: one\r\n'
        b'Content-Length: 0\r\n\r\n')
    fd.flush()
    headers = readuntil2crlfs(fd)
    exp = b'HTTP/1.1 201'
    assert headers[:len(exp)] == exp, \
        "Expected %r, encountered %r" % (exp, headers[:len(exp)])

    sock = connect_tcp(('localhost', prolis.getsockname()[1]))
    fd = sock.makefile('rwb')
    fd.write(
        b'PUT /v1/a/c2 HTTP/1.1\r\nHost: localhost\r\n'
        b'Connection: close\r\nX-Auth-Token: t\r\nX-Storage-Policy: two\r\n'
        b'Content-Length: 0\r\n\r\n')
    fd.flush()
    headers = readuntil2crlfs(fd)
    exp = b'HTTP/1.1 201'
    assert headers[:len(exp)] == exp, \
        "Expected '%s', encountered '%s'" % (exp, headers[:len(exp)])
    return context
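
# Hedged usage sketch (not part of the module above): the returned context exposes the
# entries described in the docstring, e.g. the proxy application and the server
# greenthreads; killing the greenthreads for teardown is an assumption, not the module's
# documented API.
context = setup_servers()
prosrv = context["test_servers"][0]
for coro in context["test_coros"]:
    coro.kill()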
Beispiel #54
0
    def persist(self, filename=None):
        if filename is None:
            filename = self.filename
        with open(filename, 'wb') as f:
            pickle.dump(self.cache, f)
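
    # Hedged companion sketch, not in the original class: restore a cache written by
    # persist(); the method name `restore` is an assumption.
    def restore(self, filename=None):
        if filename is None:
            filename = self.filename
        with open(filename, 'rb') as f:
            self.cache = pickle.load(f)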
Beispiel #55
0
            for cps in list(cCP.values()):
                if not cps == cp:
                    b[cp].add_others(t, cps, CoPoMap[t, x, y].nTrCP[cp],
                                     CoPoMap[t, x, y].nTrCP[cps])

###############################################################
# SAVE DATA
############################################################
btime = datetime.datetime.now()

print 'took ', (btime - atime)

#if not os.path.exists(odir+EXPID+'/tempdata/'):
# os.makedirs(odir+EXPID+'/tempdata/')
f = open(odir + EXPID + '/output/cp/Tracer.save', 'wb')
cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()

if lmap or lmap2:
    f = open(odir + EXPID + '/output/cp/TracerMap.save', 'wb')
    cPickle.dump(CoPoMap, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
    print 'lmap took:', sumlmaptime

if lcpstart:
    f = open(odir + EXPID + '/output/cp/CPstart.save', 'wb')
    cPickle.dump(CPinit, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
    print 'lcpstart took:', sumlcpstarttime

f = open(odir + EXPID + '/output/cp/CPlife.save', 'wb')
Beispiel #56
0
            model_save_path = os.path.join(sstype_path,
                                           rbp_name + '_' + cell_name)
            nntrainer = nn.NeuralTrainer(nnmodel,
                                         save='best',
                                         file_path=model_save_path)

            # initialize session
            sess = utils.initialize_session(nnmodel.placeholders)

            # load best model
            nntrainer.set_best_parameters(sess)

            # test model on validation set
            loss, mean_vals, std_vals = nntrainer.test_model(sess,
                                                             test,
                                                             batch_size=128,
                                                             name='test',
                                                             verbose=1)

            # store results
            results.append(mean_vals)

            sess.close()
            # save results

        # store results
        with open(os.path.join(sstype_path, 'test_scores.pickle'), 'wb') as f:
            cPickle.dump(np.array(results),
                         f,
                         protocol=cPickle.HIGHEST_PROTOCOL)
Beispiel #57
0
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            args.init_from), " %s must be a a path" % args.init_from
        assert os.path.isfile(
            os.path.join(args.init_from, "config.pkl")
        ), "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(
            os.path.join(args.init_from, "chars_vocab.pkl")
        ), "chars_vocab.pkl.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(
                args
            )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        # instrument for tensorboard
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            sess.run(
                tf.assign(model.lr, args.learning_rate * (args.decay_rate**e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                # instrument for tensorboard
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op],
                    feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)

                end = time.time()
                print(
                    "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    .format(e * data_loader.num_batches + b,
                            args.num_epochs * data_loader.num_batches, e,
                            train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0\
                        or (e == args.num_epochs-1 and
                            b == data_loader.num_batches-1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess,
                               checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
Beispiel #58
0
def train_lstm(
        dim_proj=128,  # word embedding dimension and number of LSTM hidden units
        patience=10,  # Number of epochs to wait before early stop if no progress
        max_epochs=5000,  # The maximum number of epochs to run
        dispFreq=10,  # Display to stdout the training progress every N updates
        decay_c=0.,  # Weight decay for the classifier applied to the U weights
        lrate=0.0001,  # SGD learning rate (not used for adadelta and rmsprop)
        n_words=10000,  # vocabulary size
        optimizer=adadelta,  # sgd, adadelta and rmsprop are available; sgd is very hard to use
        encoder='lstm',  # use an LSTM network
        saveto='lstm_model.npz',  # store the best model as lstm_model.npz
        validFreq=370,  # compute the validation error after every 370 updates
        saveFreq=1110,  # save the parameters every 1110 updates
        maxlen=100,  # maximum sequence length
        batch_size=16,  # batch size used during training
        valid_batch_size=64,  # batch size used for validation/testing
        dataset='lmdb',  # use the lmdb dataset

        # some other parameters
        noise_std=0,  # noise
        use_dropout=True,  # use dropout; if False slightly faster, but worse test error
        # This frequently needs a bigger model.
        reload_model=None,  # path to the saved model parameters
        test_size=-1,  # if test_size > 0, keep only this number of test samples
):

    # model options
    model_options = locals().copy()  # returns a dictionary containing the current scope's local variables
    print("model options", model_options)
    # load data
    load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    # load the training, validation and test sets
    train, valid, test = load_data(n_words=n_words,
                                   valid_portion=0.05,
                                   maxlen=maxlen)  # 5% of the data is used for validation

    if test_size > 0:
        # the test set is sorted by length, so select a random subset
        idx = np.arange(len(test[0]))
        np.random.shuffle(idx)  # shuffle
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = np.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swap the axis!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'),
                                -1)
                    print('Done')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ',
                          test_err)

                    if (len(history_errs) > patience and valid_err >=
                            np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)
    if saveto:
        np.savez(saveto,
                 train_err=train_err,
                 valid_err=valid_err,
                 test_err=test_err,
                 history_errs=history_errs,
                 **best_p)
    print('The code run for %d epochs, with %f sec/epochs' %
          ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print(('Training took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err

def save_dict(di_, filename):
    with open(filename, "wb") as f:
        pickle.dump(di_, f)
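
# Hedged companion sketch: read back a dictionary written by save_dict() above.
def load_dict(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)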
# assume you have a data file named 'example.pickle'
# retrieve the data from the saved pickle file
try:
    f_read = open('example.pickle', 'rb')    # open the file for reading
    mydata = pickle.load(f_read)['evoked_EMGs']
    f_read.close()
except Exception as e:  # capture the exception object
    print('Unable to read data from', 'example.pickle', ':', e)
    raise


#%% pseudo code, save data into pickle file

pickle_file_name = 'myfile.pickle'   # assume the pickle file is named 'myfile.pickle'
pickle_file = os.path.join(directory, pickle_file_name)  # directory is where you want to save your data file

try:
  f = open(pickle_file, 'wb')                   # open the file for writing
  # collect the data to save into a dictionary
  save = {
    'key1': data1,   # key and value pair
    'key2': data2,  
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL) # save all the data into the file named in pickle_file, 
                                                # use the highest protocol version available
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise
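
# Hedged closing note with a minimal sketch: several snippets above use Python 2's cPickle
# while others use Python 3's pickle. When a pickle written under Python 2 has to be read
# under Python 3, pickle.load's `encoding` argument is the usual workaround; the file name
# below is only an example.
import pickle

with open('legacy_py2_data.pickle', 'rb') as f:
    legacy_data = pickle.load(f, encoding='latin1')  # decode Python 2 str objects as latin-1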