def fit(self, x, y, num_epochs=20, steps_per_epoch=200, batch_size=1000,
        validate=False, x_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None) -> None:
    losses = []
    val_losses = []
    val_max_open_losses = []
    val_max_cons_losses = []
    val_max_dist_losses = []
    for epoch_no in range(num_epochs):
        print("Epoch {epoch_no}/{num_epochs}".format(epoch_no=epoch_no, num_epochs=num_epochs))
        bar = Bar('', max=steps_per_epoch, suffix='%(index)d/%(max)d ETA: %(eta)ds')
        for step_no in range(steps_per_epoch):
            eff_batch_size = min(batch_size, x.shape[0])
            batch_idx = random.choices(population=list(range(x.shape[0])), k=eff_batch_size)
            batch_x = x[batch_idx]
            batch_y = y[batch_idx]
            loss, _ = self.sess.run([self.loss, self.train_op],
                                    feed_dict={self.x: batch_x, self.y: batch_y})
            losses.append(loss)
            bar.message = 'loss: {loss:.8f}'.format(loss=np.mean(losses[-steps_per_epoch:]))
            bar.next()
        bar.finish()
        if validate:
            val_loss, val_max_open_loss, val_max_cons_loss, val_max_dist_loss = \
                self.evaluate(x_val, y_val, batch_size=batch_size)
            val_losses.append(val_loss)
            val_max_open_losses.append(val_max_open_loss)
            val_max_cons_losses.append(val_max_cons_loss)
            val_max_dist_losses.append(val_max_dist_loss)
            print("Validation loss: {val_loss:.8f}".format(val_loss=val_loss))
            # Plot validation losses collected so far
            self.plot_val_losses(val_losses, val_max_open_losses,
                                 val_max_cons_losses, val_max_dist_losses)
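
# A minimal usage sketch for the fit() method above. Hypothetical: it assumes a model
# class that exposes fit() together with sess/loss/train_op/evaluate, and uses toy
# numpy data; the class name and shapes are illustrative, not from the original code.
import numpy as np

x_train = np.random.rand(5000, 16).astype(np.float32)
y_train = np.random.rand(5000, 4).astype(np.float32)
x_val = np.random.rand(500, 16).astype(np.float32)
y_val = np.random.rand(500, 4).astype(np.float32)

model = SomeRegressionModel()  # hypothetical class defining fit() as above
model.fit(x_train, y_train,
          num_epochs=5, steps_per_epoch=100, batch_size=256,
          validate=True, x_val=x_val, y_val=y_val)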
def validator(file_map, result_filename, thread_count):
    headers = {
        'cache-control': "no-cache",
        'content-type': "application/x-www-form-urlencoded"
    }
    result_file = open(result_filename, 'w', encoding='utf8')
    with futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
        future_to_url = {}
        valid_emails = []
        while True:
            line = file_map.readline()
            if line:
                email = line.split(b';')[0]
            else:
                break
            future_to_url[executor.submit(load_url, email)] = line
        bar = Bar('Valid/Invalid: 0/0', max=len(future_to_url.keys()), suffix='%(percent)d%%')
        resp_err = 0
        resp_ok = 0
        valid = 0
        invalid = 0
        for future in futures.as_completed(future_to_url):
            line = future_to_url[future]
            try:
                response = future.result()
                if response.ok:
                    json = response.json()
                    if json['status'] == 'Valid':
                        valid += 1
                        # use our own new line
                        valid_emails.append('{}\n'.format(line.decode("utf-8").replace('\n', '')))
                    else:
                        invalid += 1
                    bar.message = 'Valid/Invalid: {}/{}'.format(valid, invalid)
                else:
                    print("Error in request for email: " + line.decode("utf-8"))
                    response.raise_for_status()
            except Exception as exc:
                resp_err = resp_err + 1
            else:
                resp_ok = resp_ok + 1
            bar.next()
        print('\nSuccessful runs: {}'.format(resp_ok))
        print('Failed runs: {}'.format(resp_err))
    result_file.writelines(valid_emails)
    result_file.close()
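
# validator() expects a load_url(email) helper that performs the actual verification
# request. A minimal sketch, assuming a hypothetical HTTP endpoint that returns JSON
# with a 'status' field of 'Valid' or 'Invalid'; the URL and payload format are
# illustrative only and not taken from the original code.
import requests

VERIFY_URL = 'https://example.com/api/verify'  # hypothetical endpoint

def load_url(email, timeout=10):
    # POST the address and return the raw response; the caller inspects
    # response.ok and response.json()['status'].
    return requests.post(VERIFY_URL,
                         data={'email': email},
                         headers={'cache-control': 'no-cache',
                                  'content-type': 'application/x-www-form-urlencoded'},
                         timeout=timeout)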
def encode_syllables(variables, encoder, session, segs, kernel_only):
    num_segs = len(segs)
    batch_size = 200
    extractor = variables['extractor']
    denormalised = variables['denormalised']
    global_max = variables.get('global_max', None)
    global_min = variables.get('global_min', None)
    # The normalisation range is only needed (and only guaranteed to be present)
    # when the spectrograms are denormalised.
    global_range = (global_max - global_min) if denormalised else None

    num_batches = num_segs // batch_size
    if num_segs / batch_size > num_batches:
        num_batches += 1

    seg_idx = -1
    encoding_result = {}

    bar = Bar('', max=num_segs)

    for batch_idx in range(num_batches):
        if batch_idx == num_batches - 1:
            batch_size = num_segs - (batch_size * batch_idx)

        bar.message = 'Batch #{}/#{} batch size {}'.format(batch_idx, num_batches, batch_size)

        lengths = []
        batch_segs = []
        spects = []

        for idx in range(batch_size):
            seg_idx += 1
            seg = segs[seg_idx]
            batch_segs.append(seg)
            spect = spect_from_seg(seg, extractor)
            if denormalised:
                spect = (spect - global_min) / global_range
            dims, length = spect.shape
            lengths.append(length)
            spects.append(spect.T)
            bar.next()

        encoded = encoder.encode(spects, session=session, kernel_only=kernel_only)

        for encod, seg, length in zip(encoded, batch_segs, lengths):
            encoding_result[seg.id] = encod

    bar.finish()
    return encoding_result
def train_and_evaluate(self) -> ClassificationResults:
    """
    Train and evaluate model.
    """
    with tf.Session() as self.sess:
        # Initialize computation graph.
        self._create_model()
        # Add visualizations to computation graph.
        self._visualize_kernels()
        self._visualize_exciting_patches()
        self._visualize_incorrect_answer_images()
        # Initialize variables.
        if self.ckpt_file:
            saver = tf.train.Saver()
            try:
                saver.restore(self.sess, self.ckpt_file)
            except (tf.errors.InvalidArgumentError, tf.errors.NotFoundError):
                tf.global_variables_initializer().run()
        else:
            tf.global_variables_initializer().run()
        # Initialize summary writer.
        self.writer = tf.summary.FileWriter(logdir='conv_vis')
        # Initialize progress bar.
        bar = Bar('', max=self.steps_per_epoch, suffix='%(index)d/%(max)d ETA: %(eta)ds')
        for epoch_no in range(self.nb_epochs):
            self.logger.info("Epoch {epoch_no}/{nb_epochs}".format(
                epoch_no=epoch_no, nb_epochs=self.nb_epochs))
            for step_no in range(self.steps_per_epoch):
                # Train model on next batch.
                batch_x, batch_y = self._next_training_batch()
                results = self.train_on_batch(batch_x, batch_y)
                # Update bar.
                bar.message = 'loss: {0[0]:.8f} acc: {0[1]:.3f} mean_acc: {1:.3f}'.format(
                    results, np.mean(self.accs[-1000:]))
                bar.next()
            # Re-initialize progress bar.
            bar.finish()
            bar = Bar('', max=self.steps_per_epoch, suffix='%(index)d/%(max)d ETA: %(eta)ds')
            # Store model.
            if self.ckpt_file:
                saver.save(self.sess, self.ckpt_file)
            # Validate.
            val_results = self.validate(global_step=epoch_no)
            loss, acc, auc_roc = val_results.loss, val_results.acc, val_results.get_auc_roc()
            if self.binary_classification:
                self.logger.info(
                    "Validation results: Loss: {0}, accuracy: {1}, auc_roc: {2}".format(
                        loss, acc, auc_roc))
            else:
                self.logger.info(
                    "Validation results: Loss: {0}, accuracy: {1}".format(loss, acc))
            # Display confusion matrix.
            show_image(self._confusion_matrix)
        return val_results
def pretty(stream):
    '''
    Read from fifo pipe and output a formatted stream to stdout.
    '''
    progress = None
    started_tasks = 1
    error_messages = {}
    last_event = None
    try:
        while True:
            for line in iter(stream.readline, ""):
                event = json.loads(line)
                if event['tag'] == 'playbook_start':
                    print(banner(event['title']))
                    print(bcolors.WARNING + "Contains: " +
                          str(json.loads(event['text'])['plays']) +
                          " Play(s)." + bcolors.ENDC)
                    last_event = event['tag']
                elif event['tag'] == 'play_start':
                    if progress:
                        progress.next()
                    started_tasks = 1
                    num_tasks = json.loads(event['text'])['tasks']
                    if last_event != 'playbook_start':
                        print("\n" + banner(event['title']))
                    else:
                        print(banner(event['title']))
                    print(bcolors.WARNING + "Contains: " +
                          str(json.loads(event['text'])['tasks']) +
                          " Task(s)." + bcolors.ENDC + "\n")
                    print("TASK(s):")
                    progress = Bar("Processing...", max=num_tasks, suffix=SUFFIX)
                    progress.update()
                    last_event = event['tag']
                elif event['tag'] == 'task_start':
                    if progress:
                        progress.message = event['title']
                        progress.update()
                        if started_tasks > 1:
                            progress.next()
                        started_tasks = started_tasks + 1
                    last_event = event['tag']
                elif event['tag'] == 'playbook_complete':
                    if progress:
                        progress.next()
                        progress.finish()
                    print(banner(event['title']))
                    print(banner("RUN Statistics:"))
                    print(output_statistics(event['text']))
                    if len(error_messages) > 0:
                        print(banner("RUN Errors:"))
                        print(output_errors(error_messages))
                    last_event = event['tag']
                elif event['tag'] == 'unreachable':
                    error_messages[event['host']] = event['text']
                    last_event = event['tag']
    except KeyboardInterrupt:
        stream.flush()
        stream.close()
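
# A minimal sketch of the line-delimited JSON events pretty() consumes, fed through an
# in-memory stream instead of the fifo pipe. The field values below are made up; only
# the tags and keys ('tag', 'title', 'text', 'host') are taken from the parser above.
import io
import json

events = [
    {'tag': 'playbook_start', 'title': 'site.yml', 'text': json.dumps({'plays': 1})},
    {'tag': 'play_start', 'title': 'webservers', 'text': json.dumps({'tasks': 2})},
    {'tag': 'task_start', 'title': 'Install nginx'},
    {'tag': 'task_start', 'title': 'Start nginx'},
    {'tag': 'playbook_complete', 'title': 'Run complete',
     'text': json.dumps({'ok': 2, 'failed': 0})},
]
stream = io.StringIO('\n'.join(json.dumps(e) for e in events) + '\n')
# pretty(stream)  # would spin forever once the StringIO is exhausted; the real caller passes a fifo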
def pecuzal_embedding(s, taus=range(50), theiler=1, sample_size=1., K=13, KNN=3,
                      L_threshold=0.0, alpha=0.05, p=0.5, max_cycles=10, econ=False):
    '''Performs an embedding of time series using the PECUZAL method

    Parameters
    ----------
    s : 'numpy.ndarray' (N, M)
        Input time series of length N as numpy array. This can be a multivariate set,
        where the M time series are stored in the columns.
    taus : `iterable`, optional
        Possible delay values in sampling time units (Default is `taus=range(50)`).
        For each of the `tau`'s in `taus` the continuity statistic `avrg_eps_star` gets
        computed and further processed in order to find optimal delays for each
        embedding cycle.
    theiler : `int`, optional
        Theiler window for excluding serially correlated points from the neighbourhood,
        in sampling time units (Default is `theiler = 1`).
    sample_size : `float`, optional
        Number of considered fiducial points as a fraction of the input time series
        length, i.e. a float from the interval (0, 1] (Default is `sample_size = 1.0`,
        i.e. all points of the actual trajectory get considered).
    K : `int`, optional
        The amount of nearest neighbors in the Delta-ball. Must be at least 8 (in order
        to guarantee a valid statistic); Default is `K = 13`. The continuity statistic
        `avrg_eps_star` is computed in each embedding cycle, taking the minimum result
        over all `k in K`.
    KNN : `int`, optional
        The number of nearest neighbors to be considered in the L-statistic,
        Default is `KNN = 3`.
    L_threshold : `float`, optional
        The algorithm breaks when this threshold is exceeded by `ΔL` in an embedding
        cycle (set as a positive number, i.e. an absolute value of `ΔL`).
    alpha : `float`, optional
        Significance level for obtaining the continuity statistic `avrg_eps_star` in
        each embedding cycle (Default is `alpha = 0.05`).
    p : `float`, optional
        Binomial p for obtaining the continuity statistic `avrg_eps_star` in each
        embedding cycle (Default is `p = 0.5`).
    max_cycles : `int`, optional
        The algorithm will stop after that many cycles no matter what. Default is
        `max_cycles = 10`.
    econ : `bool`, optional
        Economy-mode for the L-statistic computation. Instead of computing L-statistics
        for time horizons `2:Tw`, here we only compute them for `2:2:Tw`.

    Returns
    -------
    Y : 'numpy.ndarray' (N', m)
        The trajectory from the embedding of length `N' = N-sum(tau_vals)` of dimension
        `m` (embedding dimension).
    tau_vals : 'list' [`int`]
        The chosen delay values for each embedding cycle, `len(tau_vals) = m`.
    ts_vals : 'list' [`int`]
        The according time series number (index) chosen for each delay value in
        `tau_vals`, `len(ts_vals) = m`. For univariate embedding `ts_vals` is a vector
        of zeros of length `len(tau_vals)`, because there is simply just one time series
        to choose from, i.e. index 0.
    Ls : 'list'
        The :math:`\\Delta L`-statistic for each embedding cycle, including the very
        last encountered cycle, which will not enter the final trajectory `Y`. The total
        decrease of :math:`\\Delta L` is, thus, :math:`\\Delta L_t = np.sum(Ls[:-1])`.
    avrg_eps_stars : 'list' [`list`]
        The continuity statistics for each embedding cycle. Contains `avrg_eps_star` of
        each embedding cycle.

    See also
    --------
    uzal_cost
    uzal_cost_pecuzal
    continuity_statistic

    Notes
    -----
    The method works iteratively and gradually builds the final embedding vectors `Y`,
    as proposed in [kraemer2020]_ . Based on the continuity statistic `avrg_eps_star`
    [pecora2007]_ the algorithm picks an optimal delay value `tau_i` for each embedding
    cycle `i`.

    For achieving that, we take the input time series `s`, denoted as the actual phase
    space trajectory `Y_actual`, and compute the continuity statistic `avrg_eps_star`.

    1. Each local maximum in `avrg_eps_star` is used for constructing a candidate
       embedding trajectory `Y_trial` with a delay corresponding to that specific peak
       in `avrg_eps_star`.
    2. We then compute the `L`-statistic [uzal2011]_ for `Y_trial` (`L_trial`) and
       `Y_actual` (`L_actual`) for increasing prediction time horizons (free parameter
       in the `L`-statistic) and save the maximum difference `max(L_trial - L_actual)`
       as :math:`\\Delta L` (note that this is a negative number, since the
       `L`-statistic decreases with better reconstructions).
    3. We pick the peak/`tau`-value for which :math:`\\Delta L` is minimal (= maximum
       decrease of the overall `L`-value) and construct the actual embedding trajectory
       `Y_actual` (steps 1.-3. correspond to an embedding cycle).
    4. We repeat steps 1.-3. with `Y_actual` as input and stop the algorithm when
       :math:`\\Delta L` is > 0, i.e. when an additional embedding component would not
       lead to a lower overall L-value. `Y_actual` -> `Y`.

    In case of multivariate embedding, i.e. when embedding a set of `M` time series, in
    each embedding cycle the continuity statistic `avrg_eps_star` gets computed for all
    `M` time series available. The optimal delay value `tau_i` in each embedding cycle
    `i` is chosen as the peak/`tau`-value for which :math:`\\Delta L` is minimal under
    all available peaks and under all M `avrg_eps_star`'s. In the first embedding cycle
    there will be :math:`M^2` different `avrg_eps_star`'s to consider, since it is not
    clear a priori which time series of the input should constitute the first component
    of the embedding vector and form `Y_actual`.

    The range of considered delay values is determined in `taus` and for the nearest
    neighbor search we respect the Theiler window `theiler`. The final embedding vector
    is stored in `Y` (`numpy.ndarray`). The chosen delay values for each embedding cycle
    are stored in `tau_vals` (`list` of `int`) and the according time series numbers
    chosen for each delay value in `tau_vals` are stored in `ts_vals`. For univariate
    embedding `ts_vals` is a vector of zeros of length `len(tau_vals)`, because there is
    simply just one time series to choose from. The function also returns the
    :math:`\\Delta L`-values `Ls` for each embedding cycle and the continuity statistics
    `avrg_eps_stars` as a `list` of `list`s.

    For distance computations the Euclidean norm is used.

    References
    ----------
    .. [pecora2007] Pecora et al., "A unified approach to attractor reconstruction",
       Chaos, vol. 17, 013110, 2007. https://doi.org/10.1063/1.2430294
    .. [uzal2011] Uzal et al., "Optimal reconstruction of dynamical systems: A noise
       amplification approach", Physical Review E, vol. 84, 016223, 2011.
       https://doi.org/10.1103/PhysRevE.84.016223
    '''
    if np.ndim(s) > 1:
        assert (np.size(s, 0) > np.size(s, 1)), "You must provide a numpy array storing the time series in its columns."
        D = np.size(s, 1)
    else:
        D = 1
    assert (K >= 8) and (type(K) is int) and (K < len(s)), "You must provide a delta-neighborhood size consisting of at least 8 neighbors."
    assert (KNN >= 1) and (type(KNN) is int), "You must provide a valid integer number of considered nearest neighbours for the computation of the L-statistic."
    assert (sample_size > 0) and (sample_size <= 1), "sample_size must be in (0 1]"
    assert (theiler >= 0) and (type(theiler) is int) and (theiler < len(s)), "Theiler window must be a positive integer smaller than the time series length."
    assert (alpha >= 0) and (alpha < 1), "Significance level alpha must be in (0 1)"
    assert (p >= 0) and (p < 1), "Binomial p parameter must be in (0 1)"
    assert (L_threshold >= 0), "L_threshold must be given as an absolute value, i.e. a positive number."
    assert (type(econ) is bool), "econ parameter must be a Boolean. Set to True or False (default)"

    norm = 'euclidean'
    threshold = -L_threshold
    s_orig = s
    s = zscore(s, ddof=1)  # especially important for comparative L-statistics

    # define actual phase space trajectory
    Y_act = []

    # set a flag, in order to tell the while loop when to stop. Each loop
    # stands for encountering a new embedding dimension
    flag, counter = True, 0

    # preallocate output variables
    tau_vals = [0]
    ts_vals = []
    Ls = []
    eps = np.empty(shape=(len(taus), max_cycles))

    # loop over increasing embedding dimensions until some break criterion will
    # tell the loop to stop/break
    bar = Bar('PECUZAL embeds your time series: Executing embedding cycle no.: 1', max=max_cycles)
    while flag:
        bar.next()
        bar.message = 'PECUZAL embeds your time series: Executing embedding cycle no.:{}'.format(counter + 2)
        Y_act, tau_vals, ts_vals, Ls, eps = pecuzal_multivariate_embedding_cycle(
            Y_act, flag, s, taus, theiler, counter, eps, tau_vals, norm,
            Ls, ts_vals, sample_size, K, alpha, p, KNN, econ)

        flag = pecuzal_break_criterion(Ls, counter, max_cycles, threshold)
        counter += 1
    bar.finish()

    # construct final reconstruction vector
    if D > 1:
        Y_final = s_orig[:, ts_vals[0]]
        for i in range(len(tau_vals[:-2])):
            Y_final = hcat_lagged_values(Y_final, s_orig[:, ts_vals[i + 1]], tau_vals[i + 1])
    else:
        Y_final = s_orig
        for i in range(len(tau_vals[:-2])):
            Y_final = hcat_lagged_values(Y_final, s_orig, tau_vals[i + 1])

    return Y_final, tau_vals[:-1], ts_vals[:-1], Ls[:-1], eps[:, :counter]
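
# A short usage sketch for pecuzal_embedding(), assuming the function is available as
# defined above and using a synthetic univariate signal. The signal, seed, and parameter
# choices are illustrative only; they are not taken from the original documentation.
import numpy as np

np.random.seed(0)
t = np.arange(0, 100, 0.1)
x = np.sin(t) + 0.1 * np.random.randn(len(t))   # toy noisy oscillation

Y, tau_vals, ts_vals, Ls, eps = pecuzal_embedding(
    x, taus=range(30), theiler=10, econ=True)
print('reconstruction shape:', np.shape(Y), 'delays:', tau_vals)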
def backup(self, local_dir, delete_files=True, dry_run=False):
    logging.debug('backing up directory ' + local_dir)
    if not common.is_dir(local_dir):
        logging.error("local directory " + local_dir + " not found. Cannot continue!")
        raise Exception("Local directory " + local_dir + " not found")
    local_clfiles = self.index_local_dir(local_dir, self.__exclusion_list)
    remote_clfiles = self.ls(os.path.basename(local_dir))
    ops = self.compare_clfiles(local_dir, local_clfiles, remote_clfiles, delete_files)
    if self._show_progress:
        bar = Bar('Progress', max=len(ops),
                  suffix='%(index)d/%(max)d %(percent)d%% [%(elapsed_td)s/%(eta_td)s]')
    if dry_run is True:
        common.print_line('performing a dry run. no changes are committed')
    for op in ops:
        logging.debug('operation: ' + op.operation + ", path: " + op.src.path)
        if self._show_progress:
            bar_title = op.src.name.ljust(25, '.')
            if len(bar_title) > 25:
                bar_title = bar_title[0:25]
            bar.message = 'file:' + bar_title
        if op.src.is_dir and op.operation != operation.Operation.REMOVE:
            logging.debug('skipping directory ' + op.src.path)
            continue
        if op.operation == operation.Operation.ADD:
            best_remote = self.get_best_remote(int(op.src.size))
            logging.debug('best remote: ' + best_remote)
            if not self._show_progress:
                common.print_line('backing up file ' + op.src.path + '/' + op.src.name +
                                  ' -> ' + best_remote + ':' + op.src.remote_path)
            if dry_run is False:
                self.copy(op.src.path + '/' + op.src.name, op.src.remote_path, best_remote)
        if op.operation == operation.Operation.UPDATE:
            best_remote = self.get_best_remote(int(op.src.size))
            logging.debug('best remote: ' + best_remote)
            if not self._show_progress:
                common.print_line('backing up file ' + op.src.path + '/' + op.src.name +
                                  ' -> ' + op.src.remote + ':' + op.src.remote_path)
            if dry_run is False:
                self.copy(op.src.path + '/' + op.src.name, op.src.remote_path, op.src.remote)
        if op.operation == operation.Operation.REMOVE and delete_files is True:
            if not self._show_progress:
                common.print_line('removing ' + op.src.remote + op.src.path)
            if op.src.is_dir:
                if dry_run is False:
                    try:
                        self.rmdir(op.src.path, op.src.remote)
                    except Exception as e:
                        logging.debug(str(e))
            else:
                if dry_run is False:
                    self.delete_file(op.src.path, op.src.remote)
        if self._show_progress:
            bar.next()
    if self._show_progress:
        bar.finish()
def load_simple_questions_dataset(config, force_reload=False):
    bar = Bar(suffix='%(index)d/%(max)d - %(elapsed)ds')
    data_npz = os.path.join(config.data_dir, 'data.npz')
    word2idx_txt = os.path.join(config.data_dir, 'word2idx.txt')

    if (os.path.exists(data_npz) and os.path.exists(word2idx_txt)
            and not force_reload):
        bar.max = 2

        bar.message = 'Loading npz'
        bar.next()
        npz = np.load(data_npz)
        embd_mat = npz['embd_mat']
        train_ques = npz['train_ques'].astype(np.int32)
        train_ans = npz['train_ans'].astype(np.int32)
        valid_ques = npz['valid_ques'].astype(np.int32)
        valid_ans = npz['valid_ans'].astype(np.int32)

        bar.message = 'Loading word2idx'
        bar.next()
        with open(word2idx_txt) as f:
            reader = csv.reader(f, delimiter='\t')
            word2idx = {row[0]: int(row[1]) for row in reader}

        bar.finish()
        train = train_ques, train_ans
        valid = valid_ques, valid_ans
        return train, valid, embd_mat, word2idx

    bar.max = 8

    bar.message = 'Loading GloVe vocab'
    bar.next()
    glove_vocab = load_glove_vocab(os.path.join(config.data_dir, 'glove'), '42B', 300)

    bar.message = 'Loading SimpleQuestions'
    bar.next()
    train, valid, dataset_vocab = load_simple_questions(config)

    bar.message = 'Removing unknown answers'
    bar.next()
    train, new_vocab = remove_unknown_answers(train, glove_vocab)
    dataset_vocab.update(new_vocab)
    valid, new_vocab = remove_unknown_answers(valid, glove_vocab)
    dataset_vocab.update(new_vocab)
    train_q, train_a = train[0], train[1]
    valid_q, valid_a = valid[0], valid[1]

    bar.message = 'Replacing unknown tokens'
    bar.next()
    unknowns = dataset_vocab - glove_vocab
    train_q = replace_unknowns(train_q, unknowns)
    train_a = replace_unknowns(train_a, unknowns)
    valid_q = replace_unknowns(valid_q, unknowns)
    valid_a = replace_unknowns(valid_a, unknowns)
    vocab = dataset_vocab - unknowns

    bar.message = 'Appending pads'
    bar.next()
    max_len = max(len(sent) for sent in train_q + valid_q)
    train_q = append_pads(train_q, max_len)
    valid_q = append_pads(valid_q, max_len)
    vocab.update([TOK_UNK, TOK_PAD])

    bar.message = 'Loading GloVe embeddings'
    bar.next()
    embd_mat, word2idx = load_glove_embeddings(
        os.path.join(config.data_dir, 'glove'), '42B', 300, vocab)

    bar.message = 'Converting token to index'
    bar.next()
    train_q = convert_to_idx(train_q, word2idx)
    train_a = convert_to_idx(train_a, word2idx)
    valid_q = convert_to_idx(valid_q, word2idx)
    valid_a = convert_to_idx(valid_a, word2idx)

    bar.message = 'Saving processed data'
    bar.next()
    with open(word2idx_txt, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(word2idx.items())
    data_dict = dict(embd_mat=embd_mat, train_ques=train_q, train_ans=train_a,
                     valid_ques=valid_q, valid_ans=valid_a)
    np.savez(data_npz, **data_dict)

    bar.finish()
    train = np.array(train_q), np.array(train_a)
    valid = np.array(valid_q), np.array(valid_a)
    return train, valid, embd_mat, word2idx