def train(net, lr, trainloader, epoch): """ Train SSD @args net: (nn.Module) network lr: (float) learning rate trainloader: (DataLoader) dataloader epoch: (int) training epoch """ net.train() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4) criterion = MultiBoxLoss(num_classes=config[args.dataset]['num_classes']+1) progress_bar = ProgressBar(total=len(trainloader)) train_loss = 0 torch.set_printoptions(threshold=10000) for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): images = Variable(images.cuda()) loc_targets = Variable(loc_targets.cuda()) conf_targets = Variable(conf_targets.cuda()) optimizer.zero_grad() loc_preds, conf_preds = net(images) loc_loss, conf_loss, loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) loss.backward() optimizer.step() writer.add_scalar('train/loss_loc', loc_loss, batch_idx + epoch * len(trainloader)) writer.add_scalar('train/loss_conf', conf_loss, batch_idx + epoch * len(trainloader)) writer.add_scalar('train/loss_total', loss, batch_idx + epoch * len(trainloader)) train_loss += loss.item() progress_bar.move(leftmsg="training epoch " + str(epoch), rightmsg="loss: %.6f" % (train_loss/(batch_idx+1)))
def export_to_xml_in_folder(source, destination=Defaults.MUNIN_XML_FOLDER): """ Calls "rrdtool dump" to convert RRD database files in "source" folder to XML representation Converts all *.rrd files in source folder """ assert os.path.exists(source) try: os.makedirs(destination) except OSError as e: if e.errno != errno.EEXIST: raise filelist = [("", os.path.join(source, file)) for file in os.listdir(source) if file.endswith(".rrd")] nb_files = len(filelist) progress_bar = ProgressBar(nb_files) print("Exporting {0} RRD databases:".format(nb_files)) for domain, file in filelist: src = os.path.join(source, domain, file) dst = os.path.join( destination, "{0}-{1}".format(domain, file).replace(".rrd", ".xml")) progress_bar.update() code = subprocess.check_call(['rrdtool', 'dump', src, dst]) return nb_files
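# Note: every snippet in this file imports ProgressBar from its own project-local utils
# module, so the interfaces differ (update(), next(), refresh(), show(), ...). As context
# only, a minimal count-based sketch matching the ProgressBar(total) / update() usage in
# the export function above could look like this; it is a hypothetical stand-in, not any
# project's actual class.
import sys

class ProgressBar:
    """Minimal count-based progress bar: construct with a total, call update() per item."""

    def __init__(self, total, width=40):
        self.total = max(int(total), 1)
        self.width = width
        self.count = 0

    def update(self, step=1):
        self.count += step
        filled = int(self.width * min(self.count, self.total) / self.total)
        bar = '#' * filled + '-' * (self.width - filled)
        sys.stdout.write('\r[{0}] {1}/{2}'.format(bar, self.count, self.total))
        sys.stdout.flush()
        if self.count >= self.total:
            sys.stdout.write('\n')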
def load_images( path, preprocessor=None, limit=None, ): images = [] images_id = next(os.walk(path))[2] size = limit if limit is not None else len(images_id) print(f"Loading {size} images") prog = ProgressBar(100, size) for id in range(size): name = images_id[id] filename = path + "/" + name image = load_img(filename, target_size=(224, 224)) image = img_to_array(image) image = image.reshape( (1, image.shape[0], image.shape[1], image.shape[2])) if preprocessor is not None: image = preprocessor.preprocess_input(image) image_id = name.split('.')[0] images.append([image_id, image]) prog.update(id) print("Loading complete") return images
def parallel_test(model_cls, model_kwargs, checkpoint, dataset, data_func, gpus, worker_per_gpu=1): ctx = multiprocessing.get_context('spawn') idx_queue = ctx.Queue() result_queue = ctx.Queue() num_workers = len(gpus) * worker_per_gpu workers = [ ctx.Process(target=worker_func, args=(model_cls, model_kwargs, checkpoint, dataset, data_func, gpus[i % len(gpus)], idx_queue, result_queue)) for i in range(num_workers) ] for w in workers: w.daemon = True w.start() for i in range(len(dataset)): idx_queue.put(i) results = {} prog_bar = ProgressBar(task_num=len(dataset)) for _ in range(len(dataset)): img_id, res = result_queue.get() results[img_id] = format_ret(res) prog_bar.update() print('\n') for worker in workers: worker.terminate() return results
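# The parallel_test driver above implies a simple queue protocol: each spawned worker pulls
# dataset indices from idx_queue, runs the model on its assigned GPU, and pushes
# (index, result) pairs into result_queue. worker_func itself is not shown in this file;
# the sketch below is a hypothetical illustration of that contract (the constructor call,
# data_func signature and checkpoint handling are assumptions, not the original code).
import queue

def worker_func(model_cls, model_kwargs, checkpoint, dataset, data_func,
                gpu_id, idx_queue, result_queue):
    model = model_cls(**model_kwargs)            # assumed constructor; checkpoint loading omitted
    while True:
        try:
            idx = idx_queue.get(timeout=1)       # stop once the index queue is drained
        except queue.Empty:
            break
        data = data_func(dataset[idx], gpu_id)   # assumed to prepare inputs for this GPU
        result = model(**data)                   # assumed to return the per-sample result
        result_queue.put((idx, result))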
def _load(self): log('[{time}] loading from {path}'.format(time=get_time(), path=self._source_path)) for i, label_tag in enumerate(self._label_tags): path = os.path.join(self._source_path, label_tag) files = sample( os.listdir(path)[self._start:self._end], self._max_num ) if self._max_num > 0 else os.listdir(path)[self._start:self._end] print('start: {}, end: {}'.format(self._start, self._end)) print(len(files)) pbar = ProgressBar(len(files)) for j, filename in enumerate(files): filepath = os.path.join(path, filename) try: with open(filepath, 'r') as f: log_sequence = json.load(f) feature = self._sequence2feature(log_sequence) self._data_ids.append( filepath.split('/')[-1].split('.')[0].split('_') [0]) self._feature_data.append(feature) self._label_data.append(i) except: log('[{time}] Failed to load file {filepath}'.format( time=get_time(), filepath=filepath)) print('[{time}] Failed to load file {filepath}'.format( time=get_time(), filepath=filepath)) pbar.updateBar(j)
def generate(self): progress_bar = ProgressBar(self.settings.nb_rrd_files) self.add_header(self.settings) for domain in self.settings.domains: for host in self.settings.domains[domain].hosts: row = self.add_row("{0} / {1}".format(domain, host)) for plugin in self.settings.domains[domain].hosts[ host].plugins: _plugin = self.settings.domains[domain].hosts[ host].plugins[plugin] panel = row.add_panel( _plugin.settings["graph_title"] or plugin, plugin) for field in _plugin.fields: query = panel.add_query(field) if "label" in _plugin.fields[field].settings: query.alias = _plugin.fields[field].settings[ "label"] progress_bar.update() panel.width = 12 // self.settings.grafana['graph_per_row'] panel.process_graph_settings(_plugin.settings) panel.process_graph_thresholds(_plugin.fields) panel.process_graph_types(_plugin.fields)
def _preprocess(self): self.lang = Lang() for text in self._texts: self.lang.index_text(text) for text in self._texts: indexes = indexes_from_text(self.lang, text) indexes.append(EOT_token) padded_indexes = pad_indexes(indexes, self._max_text_length, PAD_token) self._indexed_texts.append(padded_indexes) self._indexed_texts = np.stack(self._indexed_texts, axis=0) bar = ProgressBar(len(self._audio_files) - 1, unit='') for (audio_files_read, audio_file) in enumerate(self._audio_files): # (n_mels, T), (1+n_fft/2, T) mel, mag = compute_spectrograms(audio_file) padded_mel = pad_time_dim(mel, self._max_audio_length, 0) padded_mag = pad_time_dim(mag, self._max_audio_length, 0) self._mels.append(padded_mel.transpose()) self._mags.append(padded_mag.transpose()) bar.update(audio_files_read) self._mels = np.stack(self._mels, axis=0) self._mags = np.stack(self._mags, axis=0)
def discover_from_www(settings): """ Builds a Munin dashboard structure (domain/host/plugins) by reading the HTML files rather than listing the cache folder because the later is likely to contain old data """ # delayed import since this function should not be used in the "normal" case try: from bs4 import BeautifulSoup except ImportError: try: from BeautifulSoup import BeautifulSoup except ImportError: print "Please install BeautifulSoup to use this program" print " pip install beautifulsoup4 or easy_install beautifulsoup4" sys.exit(1) folder = settings.paths['www'] print "Reading Munin www cache: ({0})".format(folder) with open(os.path.join(folder, "index.html")) as f: root = BeautifulSoup(f.read()) domains = root.findAll("span", {"class": "domain"}) # hosts and domains are at the same level in the tree so let's open the file for domain in domains: with open(os.path.join(folder, domain.text, "index.html")) as f: domain_root = BeautifulSoup(f.read()) links = domain_root.find(id="content").findAll("a") progress_bar = ProgressBar(len(links), title=domain.text) for link in links: progress_bar.update() elements = link.get("href").split("/") if len(elements) < 2 \ or elements[0].startswith("..") \ or elements[-1].startswith("index"): continue if len(elements) == 2: host, plugin = elements[0], elements[1] elif len(elements) == 3: # probably a multigraph, we'll be missing the plugin part # we won't bother reading the html file for now and guess it from the RRD database later host, plugin = elements[0], ".".join(elements[1:3]) else: print "Unknown structure" continue plugin = plugin.replace(".html", "") settings.domains[domain.text].hosts[host].plugins[plugin].is_multigraph = (len(elements) == 3) settings.domains[domain.text].hosts[host].plugins[plugin].settings = { 'graph_title': link.text, } settings.nb_plugins += 1 return settings
def train(self, data): N = int(math.ceil(len(data) / self.batch_size)) cost = 0 x = np.ndarray([self.batch_size, self.edim], dtype=np.float32) time = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded context = np.ndarray([self.batch_size, self.mem_size]) x.fill(self.init_hid) for t in xrange(self.mem_size): time[:,t].fill(t) if self.show: from utils import ProgressBar bar = ProgressBar('Training', max=N) for idx in xrange(N): if self.show: bar.next() target.fill(0) # reset the one-hot targets for this batch for b in xrange(self.batch_size): m = random.randrange(self.mem_size, len(data)) target[b][data[m]] = 1 context[b] = data[m - self.mem_size:m] _, loss, self.step = self.sess.run([self.optim, self.loss, self.global_step], feed_dict={ self.input: x, self.time: time, self.target: target, self.context: context}) cost += loss if self.show: bar.finish() return cost/N/self.batch_size
def plot_traj(trajs, fig_size=(6, 6), color="mediumpurple", size=5, title='', is_plot_line=False, od_only=False, offset=None): """Plot the trajectories.""" if offset is None: offset = [0, 0] p = ProgressBar(len(trajs), 'Plotting trajectories') plt.figure(figsize=fig_size) for i in range(len(trajs)): p.update(i) traj = np.array(trajs[i]) if od_only: traj = [traj[0], traj[-1]] x = [x[0] + np.random.uniform(-offset[0], offset[0]) for x in traj] y = [y[1] + np.random.uniform(-offset[1], offset[1]) for y in traj] if od_only: if is_plot_line: plt.plot(x[0], y[0], c=color) plt.plot(x[1], y[1], c="yellowgreen") plt.scatter(x[0], y[0], c=color, s=size) plt.scatter(x[1], y[1], c="yellowgreen", s=size) else: if is_plot_line: plt.plot(x, y, c=color) plt.scatter(x, y, c=color, s=size) plt.title(title) plt.show()
def remove_noise(dataset, features, verbose=False): sample_0 = dataset['input'][0] _, _, F = sample_0.shape print 'Removing noise: ', pbar = ProgressBar(len(dataset['input']) * (F - 1)) for f in range(F): if (features[f] == 'time'): continue # no need to filter time for sample in dataset['input']: y = sample[:, 0, f] # compute FT of the feature f w = scipy.fftpack.rfft(y) # compute mean frequency mean = np.mean(np.abs(w)) # set the threshold to double the mean thr = 2 * mean # remove high frequency components cutoff_idx = np.abs(w) < thr w[cutoff_idx] = 0 # return to time domain by doing inverseFFT y = scipy.fftpack.irfft(w) sample[:, 0, f] = y # update progress bar pbar.next() # return return None
def stability_derivatives(self): prog = ProgressBar('Instantiating Stability Object') derivatives = StabilityDerivatives(u=self.initial_trim_case.u, w=self.initial_trim_case.w, q=0, theta_f=self.initial_trim_case.fuselage_tilt, collective_pitch=self.initial_trim_case.collective_pitch, longitudinal_cyclic=self.initial_trim_case.longitudinal_cyclic) prog.update(100) return derivatives
def import_from_xml_folder(self, folder): raise DeprecationWarning # build file list and grouping if necessary file_list = os.listdir(folder) grouped_files = defaultdict(list) errors = [] progress_bar = ProgressBar(len(file_list)) for file in file_list: fullname = os.path.join(folder, file) parts = file.replace(".xml", "").split("-") series_name = ".".join(parts[0:-2]) if self.settings.influxdb['group_fields']: grouped_files[series_name].append((parts[-2], fullname)) else: grouped_files[".".join([series_name, parts[-2]])].append(('value', fullname)) if self.settings.interactive: show = raw_input("Would you like to see the prospective series and columns? y/[n]: ") or "n" if show in ("y", "Y"): for series_name in sorted(grouped_files): print(" - {2}{0}{3}: {1}".format(series_name, [name for name, _ in grouped_files[series_name]], Color.GREEN, Color.CLEAR)) print("Importing {0} XML files".format(len(file_list))) for series_name in grouped_files: data = [] keys_name = ['time'] values = defaultdict(list) for field, file in grouped_files[series_name]: progress_bar.update() keys_name.append(field) content = read_xml_file(file) [values[key].append(value) for key, value in content.items()] # join data with time as first column data.extend([[k]+v for k, v in values.items()]) try: pass # self.upload_values(series_name, keys_name, data) except Exception as e: errors.append(str(e)) continue try: self.validate_record(series_name, keys_name) except Exception as e: errors.append("Validation error in {0}: {1}".format(series_name, e)) if errors: print("The following errors were detected while importing:") for error in errors: print(" {0} {1}".format(Symbol.NOK_RED, error))
def write2files(init_path, file_list, D, write_file_num=14650): """Write randomly sampled trajectories to files.""" rand_ind = random.sample(range(len(file_list)), write_file_num) # randomly sample trajectories p2 = ProgressBar(write_file_num, 'Writing files') for i in range(write_file_num): p2.update(i) with open(init_path + file_list[rand_ind[i]], 'w') as f2: for step in D[rand_ind[i]]: f2.write(str(step[0]) + ',' + str(step[1]) + '\n')
def check_popularity(): people = dbs.query(Person).filter(Person.count == None).all() pb = ProgressBar(len(people)) for idx, person in enumerate(people): pb.update_print(idx) references = dbs.query( PeopleRel.count).filter(PeopleRel.person == person.name).all() count = sum(i[0] for i in references) person.count = int(count) dbs.commit()
def extract_features(images, model): features = dict() count = 0 prog = ProgressBar(100, len(images)) for id, image in images: feature = model.predict(image, verbose=0) features[id] = feature count += 1 prog.update(count) return features
def markov_model(trajectory, N, epsilon): """Build the intermediate-point (Markov) transition probability matrix with Laplace noise. Args: trajectory: trajectory data (2-D array) N : number of second-level grid cells epsilon : privacy budget Returns: O_: intermediate-point transition probability matrix """ O_ = np.zeros([N, N]) # build the N*N transition probability matrix for t in trajectory: O_0 = np.zeros([N, N]) for i in range(len(t) - 1): curr_point = t[i] next_point = t[i + 1] O_0[curr_point][next_point] += 1 O_0 = O_0 / (len(t) - 1) # transition probabilities of this trajectory O_ += O_0 line_all = [] p = ProgressBar(N, 'Building intermediate-point transition matrix') for i in range(N): p.update(i) score = 0 for j in range(N): # add Laplace noise # sensitivity = 1 # randomDouble = random.random() - 0.5 # noise = - (sensitivity / epsilon) * signum(randomDouble) * math.log( # 1 - 2 * abs(randomDouble)) noise = np.random.laplace(0, 1 / epsilon) # noise = 0.00000000000000000000000001 O_[i][j] += noise if O_[i][j] < 0: O_[i][j] = 0 score += O_[i][j] line_all.append(score) # normalize each row for i in range(N): O_[i] /= line_all[i] sns.heatmap(data=O_, square=True) plt.show() return O_
def one_run(projects_train, projects_test, K, outlier_threshold, granularity): rmse_failed_run = [] rmse_success_run = [] rmse_run = [] accuracy_run = [] relative_time = np.linspace(0.025, 1, 20) bar = ProgressBar(end_value=len(relative_time), text="Time steps", count=True) bar.start() for i, rel_t in enumerate(relative_time): # Data t = int(rel_t * 999) samples = subsample(t, granularity) t = len(samples) T = 999 # Remove outliers projects_train_filtered = [p for p in projects_train if np.all((p.money[T] - outlier_threshold) <= 0) and np.all((p.money[samples] - outlier_threshold) <= 0)] projects_test_filtered = [p for p in projects_test if np.all((p.money[T] - outlier_threshold) <= 0) and np.all((p.money[samples] - outlier_threshold) <= 0)] X_train = np.ndarray(shape=(len(projects_train_filtered), t), buffer=np.array([p.money[samples] for p in projects_train_filtered]), dtype=float) y_train = np.expand_dims(np.array([p.money[T] for p in projects_train_filtered]), axis=1) X_test = np.ndarray(shape=(len(projects_test_filtered), t), buffer=np.array([p.money[samples] for p in projects_test_filtered]), dtype=float) y_test = np.expand_dims(np.array([p.money[T] for p in projects_test_filtered]), axis=1) #X_max = np.max(X_train, axis=0) #X_train = X_train / X_max[np.newaxis, :] #X_test = X_test / X_max[np.newaxis, :] # Hyperparameters beta = 0.0001 epsilon = 1e0 lam = 0 iterations = 50 random_restarts = None mls = LeastSquaresMixture(X_train, y_train, K=K, beta=beta, lam=lam, iterations=iterations, epsilon=epsilon, random_restarts=random_restarts) mls.train(verbose=False) #print(mls) rmse_failed, rmse_success, rmse, accuracy = mls.evaluate(X_test, y_test, verbose=False) rmse_failed_run.append(rmse_failed) rmse_success_run.append(rmse_success) rmse_run.append(rmse) accuracy_run.append(accuracy) bar.update(i) print(accuracy_run) return rmse_failed_run, rmse_success_run, rmse_run, accuracy_run
def get_most_common(a1, a2): temp_dict1 = {} temp_dict2 = {} pb = ProgressBar(worksum=len(a1), auto_display=False) pb.startjob() num = 0 for s1, s2 in zip(a1, a2): num += 1 pb.complete(1) if args.max_words != -1 and (len(s1) > args.max_words or len(s2) > args.max_words): continue for w1 in s1: temp_dict1.setdefault(w1, 0) temp_dict1[w1] += 1 for w2 in s2: temp_dict2.setdefault(w2, 0) temp_dict2[w2] += 1 if num % 32 == 0: pb.display_progress_bar() sorted1 = sorted(temp_dict1.items(), key=lambda i: i[1], reverse=True) sorted2 = sorted(temp_dict2.items(), key=lambda i: i[1], reverse=True) #print(sorted1[:100]) #print(sorted2[:100]) return [i[0] for i in sorted1[:args.vac_dict_ch] ], [i[0] for i in sorted2[:args.vac_dict_en]]
def train(self, train, validation, num_epochs=None, learning_rate=0.01, threshold=0.001): """Train the FFNN with gradient descent. Dynamic stopping on lowest validation error. Training runs over the given number of epochs. If None are given, then training runs until the threshold (change in validation error) is reached over multiple consecutive iterations. This dynamic stopping also occurs if validation error begins to increase. When dynamic stopping is used, the network finalizes the best weights found of the duration of training. """ num_epochs_iter = num_epochs if num_epochs else 600 # 600 set to max epochs dynamic_stopping = False if num_epochs else True # Dynamically halt if num_epochs unspec. retries = 0 err = self.evaluate(validation) progress_bar = ProgressBar() for epoch in range(num_epochs_iter): last_err = err for i in range(len(train)): progress_bar.refresh(i / len(train)) sample = choice(train) # Randomly sample training data # Update weights based on the chosen sample self.prepare_network() self.propagate_input(sample.features) self.propagate_error(sample.label) self.update_weights(sample, learning_rate, momentum=0.3) progress_bar.refresh(1.0) progress_bar.clear() # Evaluate validation error err = self.evaluate(validation) print('Epoch {} validation error: {:.4f}'.format(epoch, err)) if dynamic_stopping: if last_err - err < threshold: if err <= last_err: # Still improved, but below threshold self.save_network_weights(err) retries += 1 if retries >= 100: epochs_ran = epoch break else: self.save_network_weights(err) retries = 0 else: epochs_ran = num_epochs_iter # Loop did not stop early if dynamic_stopping: self.finalize_network_weights( ) # Finalize weights to best validation error return epochs_ran
def run(self, mode, X, Y, batch_size, optimizer=None, clip=None): self.reset_states(batch_size) if optimizer: self.train(True) else: self.eval() nbatches = X.size(0) // batch_size pb = ProgressBar(mode, self.epoch, nbatches) _total_time = 0 _total_loss = 0 _total_word = 0 L = nn.CrossEntropyLoss(size_average=False) for index in range(nbatches): begin = index * batch_size end = begin + batch_size # Start if optimizer: start = time.time() x = Variable(X[begin:end], requires_grad=False) t = Variable(Y[begin:end], requires_grad=False) else: start = time.time() x = Variable(X[begin:end], requires_grad=False, volatile=True) t = Variable(Y[begin:end], requires_grad=False, volatile=True) y = self(x) loss = L(y, t.view(-1)) if optimizer: self.zero_grad() loss.backward() if clip: torch.nn.utils.clip_grad_norm(self.parameters(), clip) # clip after backward, before the update optimizer.step() # End time_per_batch = time.time() - start _total_time += time_per_batch _total_loss += loss.cpu().data.numpy()[0] _total_word += float(numpy.prod(t.size())) pb.update([('ppl', numpy.exp(_total_loss / _total_word), lambda x: x), ('wps', _total_word / _total_time, lambda x: x)]) print() return numpy.exp(_total_loss / _total_word), _total_word / _total_time
def __init__(self, args): self.initial_lr = args.learning_rate self.lr = args.learning_rate self.test_only = args.test_only self.dump_statistics = args.dump_statistics self.modelName = args.model self.experiment = args.experiment self.log_path = args.log_path self.save_path = args.save_path if not os.path.isdir(self.log_path): os.makedirs(self.log_path) self.logger = Logger( '%s/%s_%s.csv' % (self.log_path, self.modelName, args.experiment), 'epoch, time, learning_rate, tr_loss, tr_acc, val_loss, val_acc') self.progress_bar = ProgressBar() self.chrono = Chrono() self.trainset, self.testset, self.trainloader, self.testloader = dataloader( ) print('==> Building model..') self.ae = AutoEncoder() self.model = getattr(models, self.modelName)() if self.modelName == 'bit': self.model.load_from( numpy.load('./state_dicts/%s.npz' % self.modelName)) if torch.cuda.is_available(): self.ae = torch.nn.DataParallel(self.ae) self.model = torch.nn.DataParallel(self.model) torch.backends.cudnn.benchmark = True self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, momentum=0.9) self.load_ae() if args.resume or self.test_only or self.dump_statistics: self.load() self.criterion = torch.nn.CrossEntropyLoss() self.criterion = get_torch_vars(self.criterion, False) self.ae = get_torch_vars(self.ae, False) self.model = get_torch_vars(self.model, False)
def get_all_ranking(save_to): from utils import ProgressBar fout = open(save_to, 'w') images = Image.objects.all() progress = ProgressBar(len(images) * len(images), 20) for target in images: searcher = Searcher(target) searcher.run() results = [] for _, image in searcher.results: results.append((image.origin_id, len(results))) progress.update() results.sort() print >> fout, ' '.join(str(x) for _, x in results) print('Finished. Written to file "{}"'.format(save_to))
def test(test_data_loader, model): srocc = SROCC() plcc = PLCC() rmse = RMSE() len_test = len(test_data_loader) pb = ProgressBar(len_test, show_step=True) print("Testing") model.eval() with torch.no_grad(): for i, ((img, ref), score) in enumerate(test_data_loader): img, ref = img.cuda(), ref.cuda() output = model(img, ref).cpu().data.numpy() score = score.data.numpy() srocc.update(score, output) plcc.update(score, output) rmse.update(score, output) pb.show( i, "Test: [{0:5d}/{1:5d}]\t" "Score: {2:.4f}\t" "Label: {3:.4f}".format(i + 1, len_test, float(output), float(score))) print("\n\nSROCC: {0:.4f}\n" "PLCC: {1:.4f}\n" "RMSE: {2:.4f}".format(srocc.compute(), plcc.compute(), rmse.compute()))
def train(train_loader, model, criterion, optimizer, epoch): losses = AverageMeter() len_train = len(train_loader) pb = ProgressBar(len_train-1) print("Training") # Switch to train mode model.train() criterion.cuda() for i, ((img,ref), score) in enumerate(train_loader): img, ref, score = img.cuda(), ref.cuda(), score.squeeze().cuda() # Compute output output = model(img, ref) loss = criterion(output, score) # Measure accuracy and record loss losses.update(loss.data, img.shape[0]) # Compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() pb.show(i, '[{0:5d}/{1:5d}]\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' .format(i, len_train, loss=losses))
def create_examples(self, lines, example_type, cached_file, save_cache): ''' Creates examples for data ''' label_list = self.get_labels() if cached_file and cached_file.exists(): logger.info("Loading examples from cached file %s", cached_file) examples = torch.load(cached_file) else: pbar = ProgressBar(n_total=len(lines), desc='create examples') examples = [] for i, line in enumerate(lines): #if i>20:break # for quik debug guid = '%s-%d' % (example_type, i) label = line['tags'] text_a = line['info'] text_b = None match = line["cira_match"] if self.test_mode == 4 and sum(match) < 4: continue else: examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, match=match)) pbar(step=i) if save_cache: logger.info("Saving examples into cached file %s", cached_file) torch.save(examples, cached_file) return examples
def validate(val_loader, model, criterion, show_step=False): losses = AverageMeter() srocc = SROCC() len_val = len(val_loader) pb = ProgressBar(len_val-1, show_step=show_step) print("Validation") # Switch to evaluate mode model.eval() with torch.no_grad(): for i, ((img,ref), score) in enumerate(val_loader): img, ref, score = img.cuda(), ref.cuda(), score.squeeze().cuda() # Compute output output = model(img, ref) loss = criterion(output, score) losses.update(loss.data, img.shape[0]) output = output.cpu().data score = score.cpu().data srocc.update(score.numpy(), output.numpy()) pb.show(i, '[{0:5d}/{1:5d}]\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Output {out:.4f}\t' 'Target {tar:.4f}\t' .format(i, len_val, loss=losses, out=output, tar=score)) return float(1.0-srocc.compute()) # losses.avg
def _write_per_dir(self): """ write outputs of each directory to a different file """ bag_of_words = BagOfWordsParser(self._words_filter) reviews_per_dir = defaultdict(list) flags = Reviewer._get_flags_text(self._output_per_dir, self._output_in_svm_light, self._output_in_tfdidf) for dir in map(os.path.abspath, self._dirs): numoffiles = len(os.listdir(dir)) filename = Reviewer._get_filename(dir, flags) reviews = reviews_per_dir[filename] prefix = "crunching reviews for '{}'".format(filename) with ProgressBar(prefix, numoffiles) as pb: for review in bag_of_words.parse_dir(dir): pb.report() reviews.append(review) assert numoffiles == len(reviews) # we update here because the bag of words is not updated yet # when we're still reading the files in the directories for filename, reviews in reviews_per_dir.items(): self._write(filename, bag_of_words, reviews)
def trip_distribution(trajectory, N, epsilon): """ Compute the trip (origin-destination) distribution. Args: trajectory: trajectory data (2-D array) N : number of second-level grid cells epsilon : privacy budget Returns: R: transition probability matrix """ R = np.zeros((N, N)) # build the OD transition count matrix over the grid cells for t in trajectory: if len(t) > 1: sta = t[0] end = t[-1] R[sta][end] += 1 else: print(t) count = np.sum(R) print(count) p = ProgressBar(N, 'Building transition probability matrix') for i in range(N): p.update(i) for j in range(N): # add Laplace noise # sensitivity = 1 # randomDouble = random.random() - 0.5 # noise = - (sensitivity / epsilon) * signum(randomDouble) * math.log( # 1 - 2 * abs(randomDouble)) noise = np.random.laplace(0, 1 / epsilon) R[i][j] += noise if R[i][j] < 0: R[i][j] = 0 # unclear whether |D| should be recomputed after adding the noise # count += R[i][j] R /= count return R
def markov_model(trajectory, N, epsilon): """ Markov mobility model. Args: trajectory: trajectory data (2-D array) N : number of second-level grid cells epsilon : privacy budget Returns: O_: intermediate-point transition probability matrix """ O_ = np.zeros((N, N)) # build the N*N transition probability matrix for t in trajectory: O_sub = np.zeros((N, N)) for i in range(len(t) - 1): curr_point = t[i] next_point = t[i + 1] O_sub[curr_point][next_point] += 1 O_sub /= (len(t) - 1) # transition probabilities of this trajectory O_ += O_sub p = ProgressBar(N, 'Generating intermediate-point transition matrix') for i in range(N): p.update(i) for j in range(N): noise = np.random.laplace(0, 1 / epsilon) # add Laplace noise O_[i][j] += noise if O_[i][j] < 0: O_[i][j] = 0 # row-normalize row_sum = [sum(O_[i]) for i in range(N)] for j in range(N): O_[j] /= row_sum[j] # plot the matrix as a heatmap sns.heatmap(data=O_, square=True) plt.title('mobility model construction matrix (epsilon=%s)' % str(epsilon)) plt.show() return O_
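# Both trip_distribution and markov_model above follow the same differential-privacy step:
# perturb each count with Laplace noise of scale 1/epsilon, clip negatives to zero, then
# normalize. A small self-contained illustration of that step on a toy matrix (toy data,
# not taken from the snippets above):
import numpy as np

def perturb_and_normalize(counts, epsilon):
    """Add Laplace(0, 1/epsilon) noise to each cell, clip at zero, then row-normalize."""
    noisy = counts + np.random.laplace(0.0, 1.0 / epsilon, size=counts.shape)
    noisy = np.clip(noisy, 0.0, None)
    row_sums = noisy.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0            # avoid division by zero for empty rows
    return noisy / row_sums

toy_counts = np.array([[5.0, 1.0], [2.0, 2.0]])
print(perturb_and_normalize(toy_counts, epsilon=1.0))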
def spectrogram2wav(spectrogram): ''' spectrogram: [t, f], i.e. [t, nfft // 2 + 1] ''' spectrogram = spectrogram.T # [f, t] X_best = copy.deepcopy(spectrogram) # [f, t] bar = ProgressBar(hp.n_iter, unit='') for i in range(hp.n_iter): bar.update(i) X_t = invert_spectrogram(X_best) est = librosa.stft( X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) # [f, t] phase = est / np.maximum(1e-8, np.abs(est)) # [f, t] X_best = spectrogram * phase # [f, t] X_t = invert_spectrogram(X_best) return np.real(X_t)
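# spectrogram2wav above is Griffin-Lim phase reconstruction: it alternates an inverse STFT
# (invert_spectrogram, presumably wrapping librosa.istft) with re-imposing the known
# magnitude while keeping the estimated phase. A hedged usage sketch, assuming hp carries
# n_fft / hop_length / win_length / n_iter as the snippet does, and with 'example.wav' as
# a placeholder path:
import numpy as np
import librosa

y, sr = librosa.load('example.wav', sr=None)
mag = np.abs(librosa.stft(y, n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length))   # magnitude spectrogram, shape [f, t]
wav = spectrogram2wav(mag.T)                           # the function above expects [t, f]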
def predict(f_enc, f_dec, samples, batches, mat, max_len, header): ''' Sample words and compute the prediction error ''' preds = [] errs = [] progress = ProgressBar(numpy.sum([len(batch) for batch in batches]), 20, header) for batch in batches: x, mask_x, y, mask_y = load_batch(samples, batch, mat) [prev_h] = f_enc(x, mask_x) n_steps = mask_x.sum(0) n_samples = x.shape[1] sents = numpy.zeros((n_samples, max_len), 'int32') # First step - No embedded word is fed into the decoder sents[:, 0], prev_h = f_dec(numpy.asarray([-1] * n_samples, 'int32'), prev_h) n_ends = n_steps - (sents[:, 0] == 0) for i in range(1, max_len - 1): prev_words = sents[:, i - 1] if not n_ends.any(): break next_words, prev_h = f_dec(prev_words, prev_h) sents[:, i] = next_words * (n_ends > 0) n_ends -= (next_words == 0) * (n_ends > 0) for i in range(n_samples): idx = 0 while idx < max_len and n_steps[i] > 0: if sents[i, idx] == 0: n_steps[i] -= 1 idx += 1 preds.append(sents[i, :idx].tolist()) y = numpy.concatenate( [y, numpy.zeros((max_len - len(y), n_samples), 'int32')]).T mask_y = numpy.concatenate( [mask_y, numpy.zeros((max_len - len(mask_y), n_samples), 'int32')]).T errs.extend(((sents != y) * mask_y * 1.).sum(1) / mask_y.sum(1)) progress.disp(errs, ' ERR') return preds, numpy.mean(errs)
def evaluate(epoch, model, val_loader, criterion, log_path): model.eval() val_progressor = ProgressBar(log_path, mode="Val ", epoch=epoch, total_epoch=config.epochs, model_name=config.model_name, total=len(val_loader)) losses = AverageMeter() top1 = AverageMeter() with torch.no_grad(): for index, (data, label) in enumerate(val_loader): val_progressor.current = index data = Variable(data).cuda() label = Variable(torch.from_numpy(np.asarray(label))).cuda() output = model(data) loss = criterion(output, label) p_top1, p_top2 = accuracy(output, label, topk=(1, 2)) losses.update(loss.item(), data.size(0)) top1.update(p_top1[0], data.size(0)) val_progressor.current_loss = losses.avg val_progressor.current_top1 = top1.avg val_progressor() #print('epoch %d validate iteration %d: loss: %.3f' % (epoch + 1, index + 1, it_loss.data)) #correct += (output == label).sum() val_progressor.done() return losses.avg, top1.avg
def train(self, data): N = int(math.ceil(len(data) / self.batch_size)) cost = 0 x = np.ndarray([self.batch_size, self.edim], dtype=np.float32) # batch_size * internal_state_dimension time = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded context = np.ndarray([self.batch_size, self.mem_size]) # 128 * 100 x.fill(self.init_hid) for t in range(self.mem_size): time[:,t].fill(t) ''' time = array([[ 0, 1, 2, ..., 97, 98, 99], ..., [ 0, 1, 2, ..., 97, 98, 99]], dtype=int32) 128 * 100 ''' if self.show: from utils import ProgressBar bar = ProgressBar('Train', max=N) for idx in range(N): if self.show: bar.next() target.fill(0) for b in range(self.batch_size): # generate a randome number for 100 to the length of data m = random.randrange(self.mem_size, len(data)) # for this batch b, the target data[m] is set to be one target[b][data[m]] = 1 # the context is range from (m - self.mem_size) to m context[b] = data[m - self.mem_size:m] _, loss, self.step = self.sess.run([self.optim, self.loss, self.global_step], feed_dict={ self.input: x, self.time: time, self.target: target, self.context: context}) cost += np.sum(loss) if self.show: bar.finish() return cost/N/self.batch_size
def evaluate(self, samples): """Evaluate a set of samples using RMSE.""" ssq_error = 0 progress_bar = ProgressBar() for i, sample in enumerate(samples): progress_bar.refresh(i / len(samples)) ssq_error += self.sq_error(sample) progress_bar.refresh(1.0) progress_bar.clear() return sqrt(ssq_error / len(samples))
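# The evaluate/train loops of this network drive their ProgressBar with refresh(fraction)
# and clear(), unlike the count-based bars elsewhere in this file. A minimal sketch of that
# assumed interface (hypothetical, for context only):
import sys

class ProgressBar:
    """Minimal fraction-based variant: refresh(x) with x in [0, 1]; clear() erases the line."""

    def __init__(self, width=40):
        self.width = width

    def refresh(self, fraction):
        filled = int(self.width * min(max(fraction, 0.0), 1.0))
        sys.stdout.write('\r[' + '#' * filled + '-' * (self.width - filled) + ']')
        sys.stdout.flush()

    def clear(self):
        sys.stdout.write('\r' + ' ' * (self.width + 2) + '\r')
        sys.stdout.flush()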
def train(self, data): n_batch = int(math.ceil(len(data) / self.batch_size)) cost = 0 u = np.ndarray([self.batch_size, self.edim], dtype=np.float32) # (N, 150) Will fill with 0.1 T = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) # (N, 100) Will fill with 0..99 target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded sentences = np.ndarray([self.batch_size, self.mem_size]) u.fill(self.init_u) # (N, 150) Fill with 0.1 since we do not need query in the language model. for t in range(self.mem_size): # (N, 100) 100 memory cell with 0 to 99 time sequence. T[:,t].fill(t) if self.show: from utils import ProgressBar bar = ProgressBar('Train', max=n_batch) for idx in range(n_batch): if self.show: bar.next() target.fill(0) # (128, 10,000) for b in range(self.batch_size): # We random pick a word in our data and use that as the word we need to predict using the language model. m = random.randrange(self.mem_size, len(data)) target[b][data[m]] = 1 # Set the one hot vector for the target word to 1 # (N, 100). Say we pick word 1000, we then fill the memory using words 1000-150 ... 999 # We fill Xi (sentence) with 1 single word according to the word order in data. sentences[b] = data[m - self.mem_size:m] _, loss, self.step = self.sess.run([self.optim, self.loss, self.global_step], feed_dict={ self.u: u, self.T: T, self.target: target, self.sentences: sentences}) cost += np.sum(loss) if self.show: bar.finish() return cost/n_batch/self.batch_size
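# The memory-network train/test variants in this file all build batches the same way: pick
# a cut point m, feed the preceding mem_size token ids as the memory, and one-hot the token
# at position m as the prediction target. The same construction in isolation, on toy data:
import random
import numpy as np

data = list(range(1000))          # toy token-id stream
mem_size, nwords = 100, 1000

m = random.randrange(mem_size, len(data))
context = np.array(data[m - mem_size:m])   # the mem_size tokens preceding position m
target = np.zeros(nwords)
target[data[m]] = 1                        # one-hot vector for the token to predict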
def export_to_xml(settings): progress_bar = ProgressBar(settings.nb_rrd_files) try: os.makedirs(settings.paths['xml']) except OSError as e: if e.errno != errno.EEXIST: raise for domain, host, plugin, field in settings.iter_fields(): _field = settings.domains[domain].hosts[host].plugins[plugin].fields[field] if _field.rrd_found: progress_bar.update() code = subprocess.check_call(['rrdtool', 'dump', _field.rrd_filename, _field.xml_filename]) if code == 0: _field.rrd_exported = True return progress_bar.current
async def quote_many(num_quotes=1, conn_limit=20, progress=None, step=10): if progress is None: progress = ProgressBar() progress.max = num_quotes // step logger.info('Process total %d quotes with max %d concurrent connections' % (num_quotes, conn_limit)) logger.debug('... progress bar increment step size: %d coroutines' % step) semaphore = asyncio.Semaphore(conn_limit) coro_to_fut = asyncio.ensure_future futures = [ coro_to_fut(quote_with_lock(semaphore)) for i in range(num_quotes) ] t_start = datetime.today() for ith, fut in enumerate(asyncio.as_completed(futures), 1): if ith % step == 0: progress.next() await fut t_end = datetime.today() progress.finish() logger.info('All coroutines complete in {:.2f} seconds'.format( (t_end - t_start).total_seconds() )) quotes = [fut.result() for fut in futures] return quotes
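# quote_many above fans out num_quotes calls to quote_with_lock(semaphore) and caps
# concurrency with an asyncio.Semaphore. The coroutine itself is not shown in this file;
# the sketch below is a hypothetical illustration of that pattern (quote_with_lock and
# fetch_quote here are assumptions, not the original implementation).
import asyncio

async def quote_with_lock(semaphore):
    async with semaphore:            # at most conn_limit bodies run concurrently
        return await fetch_quote()

async def fetch_quote():
    await asyncio.sleep(0.1)         # stand-in for the actual HTTP request
    return 'quote'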
def gen_words(self, data, dummy_idx, N): # N = int(math.ceil(len(data) / self.batch_size)) data = data.copy() x = np.ndarray([self.batch_size, self.edim], dtype=np.float32) time = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded context = np.ndarray([self.batch_size, self.mem_size]) x.fill(self.init_hid) for t in range(self.mem_size): time[:, t].fill(t) if self.show: from utils import ProgressBar bar = ProgressBar('Generating', max=N) for idx in range(N): if self.show: bar.next() context = np.zeros([self.batch_size, self.mem_size]) + dummy_idx min_len = min(len(data), self.mem_size) context[:, -min_len:] = data[-min_len:] prediction = self.sess.run(self.output, feed_dict={ self.input: x, self.time: time, self.target: target, self.context: context}) predicted_word_index = np.argmax(prediction[0]) print(predicted_word_index) data = np.append(data, predicted_word_index) if self.show: bar.finish() return data
def test(self, data, label='Test'): n_batch = int(math.ceil(len(data) / self.batch_size)) cost = 0 u = np.ndarray([self.batch_size, self.edim], dtype=np.float32) T = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded sentences = np.ndarray([self.batch_size, self.mem_size]) u.fill(self.init_u) for t in range(self.mem_size): T[:,t].fill(t) if self.show: from utils import ProgressBar bar = ProgressBar(label, max=n_batch) m = self.mem_size for idx in range(n_batch): if self.show: bar.next() target.fill(0) for b in range(self.batch_size): target[b][data[m]] = 1 sentences[b] = data[m - self.mem_size:m] m += 1 if m >= len(data): m = self.mem_size loss = self.sess.run([self.loss], feed_dict={self.u: u, self.T: T, self.target: target, self.sentences: sentences}) cost += np.sum(loss) if self.show: bar.finish() return cost/n_batch/self.batch_size
def test(self, data, label='Test'): N = int(math.ceil(len(data) / self.batch_size)) cost = 0 x = np.ndarray([self.batch_size, self.edim], dtype=np.float32) time = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded context = np.ndarray([self.batch_size, self.mem_size]) x.fill(self.init_hid) for t in range(self.mem_size): time[:, t].fill(t) if self.show: from utils import ProgressBar bar = ProgressBar(label, max=N) m = self.mem_size for idx in range(N): if self.show: bar.next() target.fill(0) for b in range(self.batch_size): target[b][data[m]] = 1 context[b] = data[m - self.mem_size:m] m += 1 if m >= len(data): m = self.mem_size loss = self.sess.run([self.loss], feed_dict={self.input: x, self.time: time, self.target: target, self.context: context}) cost += np.sum(loss) if self.show: bar.finish() return cost/N/self.batch_size
def train(self, data): N = int(math.ceil(len(data) / self.batch_size)) # math.ceil : returns smallest integer not less than x. cost = 0 x = np.ndarray([self.batch_size, self.edim], dtype=np.float32) time = np.ndarray([self.batch_size, self.mem_size], dtype=np.int32) target = np.zeros([self.batch_size, self.nwords]) # one-hot-encoded context = np.ndarray([self.batch_size, self.mem_size]) x.fill(self.init_hid) # initialization: unlike the QA task there is no question, so the input is fixed to a constant 0.1 vector (no embedding either) for t in range(self.mem_size): time[:,t].fill(t) # [[0,1,2,3,4,...,mem_size] ... ] if self.show: from utils import ProgressBar bar = ProgressBar('Train', max=N) for idx in range(N): if self.show: bar.next() target.fill(0) for b in range(self.batch_size): m = random.randrange(self.mem_size, len(data)) # pick a random position in [mem_size, len(data)) target[b][data[m]] = 1 # the randomly chosen word becomes the one-hot target context[b] = data[m - self.mem_size:m] # use the preceding mem_size words as the context _, loss, self.step = self.sess.run([self.optim, self.loss, self.global_step], feed_dict={ self.input: x, # constant 0.1 vector self.time: time, # memory-slot lookup for temporal encoding self.target: target, # one-hot encoding of the word to predict self.context: context}) # the preceding mem_size words cost += np.sum(loss) if self.show: bar.finish() return cost/N/self.batch_size
async def save_profiles(names): conn = aiohttp.TCPConnector(limit=50, verify_ssl=False) with aiohttp.ClientSession(connector=conn) as session: ps = [Profile(name, session) for name in names] futures = [asyncio.ensure_future(p.get_info()) for p in ps] futures += [asyncio.ensure_future(p.get_publications()) for p in ps] progress, step = ProgressBar(), 10 progress.max = len(futures) // step for i, future in enumerate(asyncio.as_completed(futures), 1): if i % step == 0: progress.next() await future progress.finish() return [future.result() for future in futures]
def quote_many(num_quotes=1, conn_limit=20, progress=None, step=10): if progress is None: progress = ProgressBar() progress.max = num_quotes // step logger.info('Process total %d quotes with max %d concurrent connections' % (num_quotes, conn_limit)) logger.debug('... progress bar increment step size: %d coroutines' % step) semaphore = asyncio.Semaphore(conn_limit) # wrap coroutines with future # For Python 3.4.4+, asyncio.ensure_future(...) # will wrap coro as Task and keep input the same # if it is already Future. try: coro_to_fut = asyncio.ensure_future except AttributeError: logger.warning('asyncio.ensure_future requires Python 3.4.4+. ' 'Fall back to asyncio.async') coro_to_fut = asyncio.async futures = [ coro_to_fut(quote_with_lock(semaphore)) for i in range(num_quotes) ] t_start = datetime.today() for ith, fut in enumerate(asyncio.as_completed(futures), 1): if ith % step == 0: progress.next() yield from fut t_end = datetime.today() progress.finish() logger.info('All coroutines complete in {:.2f} seconds'.format( (t_end - t_start).total_seconds() )) quotes = [fut.result() for fut in futures] return quotes
def predict(f_enc, f_dec, samples, batches, mat, beam_size, max_len, header): ''' Sample words and compute the prediction error ''' preds = [] errs = [] progress = ProgressBar(numpy.sum([len(batch) for batch in batches]), 20, header) for batch in batches: x, mask_x, y, mask_y = load_batch(samples, batch, mat) [init_h] = f_enc(x, mask_x) n_steps = mask_x.sum(0) n_samples = x.shape[1] prev_sents = numpy.zeros((beam_size, n_samples, max_len), 'int32') # First step - No embedded word is fed into the decoder prev_words = numpy.asarray([-1] * n_samples, 'int32') prev_sents[:, :, 0], prev_log_prob, prev_h = f_dec(prev_words, init_h) prev_h = numpy.tile(prev_h, (beam_size, 1, 1)) prev_n_ends = n_steps - (prev_sents[:, :, 0] == 0) for i in range(1, max_len - 1): hypo_sents = [[]] * n_samples hypo_log_prob = [[]] * n_samples hypo_h = [[]] * n_samples hypo_n_ends = [[]] * n_samples has_hypos = numpy.asarray([False] * n_samples) for j in range(beam_size): if not prev_n_ends[j].any(): continue next_words, next_log_prob, next_h = f_dec(prev_sents[j, :, i - 1], prev_h[j]) for k in range(n_samples): if prev_n_ends[j, k] > 0: has_hypos[k] = True next_sents = numpy.tile(prev_sents[j, k], (beam_size, 1)) next_sents[:, i] = next_words[:, k] hypo_sents[k].extend(next_sents) hypo_log_prob[k].extend(next_log_prob[:, k] + prev_log_prob[j, k]) hypo_h[k].extend([next_h[k]] * beam_size) hypo_n_ends[k].extend(prev_n_ends[j, k] - (next_words[:, k] == 0)) else: hypo_sents[k].append(prev_sents[j, k].copy()) hypo_log_prob[k].append(prev_log_prob[j, k]) hypo_h[k].append(prev_h[j, k].copy()) hypo_n_ends[k].append(0) if not has_hypos.any(): break for j in range(n_samples): if not has_hypos[j]: continue indices = numpy.argsort(hypo_log_prob[j])[: -beam_size - 1: -1] for k in range(beam_size): prev_sents[k, j] = hypo_sents[j][indices[k]] prev_log_prob[k, j] = hypo_log_prob[j][indices[k]] prev_h[k, j] = hypo_h[j][indices[k]] prev_n_ends[k, j] = hypo_n_ends[j][indices[k]] sents = prev_sents[prev_log_prob.argmax(0), numpy.arange(n_samples)] for i in range(n_samples): idx = 0 while idx < max_len and n_steps[i] > 0: if sents[i, idx] == 0: n_steps[i] -= 1 idx += 1 preds.append(sents[i, : idx].tolist()) y = numpy.concatenate([y, numpy.zeros((max_len - len(y), n_samples), 'int32')]).T mask_y = numpy.concatenate([mask_y, numpy.zeros((max_len - len(mask_y), n_samples), 'int32')]).T errs.extend(((sents != y) * mask_y * 1.).sum(1) / mask_y.sum(1)) progress.disp(errs, ' ERR') return preds, numpy.mean(errs)
def main( # Dataset Configuration path_train='../train.json', # Path to load training set path_val='../val.json', # Path to load validation set path_mat_train='../VGG19_train.npy', # Path of image features of training set path_mat_val='../VGG19_val.npy', # Path of image features of validation set max_samples_train=0, # Max number of samples in training set max_samples_val=0, # Max number of samples in validation set # Model Configuration n_dim_img=4096, # Dimension of image feature n_dim_txt=250, # Dimension of word embedding n_dim_enc=1000, # Number of hidden units in encoder n_dim_dec=1000, # Number of hidden units in decoder batch_size_train=64, # Batch size in training batch_size_test=64, # Batch size in validation optimizer=adadelta, # [sgd|adam|adadelta|rmsprop], sgd not recommanded lrate=0.0002, # Learning rate for optimizer max_epochs=1000, # Maximum number of epoch to run patience=10, # Number of epoch to wait before early stop if no progress # Frequency ratio_val=1., # Validation frequency - Validate model after trained by this ratio of data ratio_save=1., # Save frequency - Save the best parameters after trained by this ratio of data # Save & Load path_load=None, # Path to load a previouly trained model path_save='model', # Path to save the models ): ''' Main function ''' print('Loading data...') n_dim_vocab = 0 # Vocabulary size samples_train, mat_train, n_dim_vocab = load_data(path_train, path_mat_train, n_dim_vocab, max_samples_train) samples_val, mat_val, n_dim_vocab = load_data(path_val, path_mat_val, n_dim_vocab, max_samples_val) print('\ttraining: %6d samples' % len(samples_train)) print('\tvalidation: %6d samples' % len(samples_val)) t_params = OrderedDict() best_params = None costs = [] if path_load: best_params = OrderedDict(numpy.load(path_load)) costs.extend(best_params['costs']) del best_params['costs'] init_t_params(best_params, t_params) print('Building model...') f_cost, f_update = build_model(t_params, n_dim_img, n_dim_txt, n_dim_enc, n_dim_dec, n_dim_vocab, optimizer) print('Training...') time_start = time.time() batches_val = get_batches(len(samples_val), batch_size_test) n_epochs = 0 n_samples = 0 n_bad_costs = 0 n_stops = 0 next_val = ratio_val * len(samples_train) next_save = max(ratio_save * len(samples_train), next_val) while n_epochs < max_epochs: n_epochs += 1 batches_train = get_batches(len(samples_train), batch_size_train, True) pgb_train = ProgressBar(len(batches_train), 20, 'EPOCH %4d ' % n_epochs) costs_train = [] for batch_train in batches_train: n_samples += len(batch_train) get_cost(f_cost, samples_train, batch_train, mat_train, costs_train, pgb_train, f_update, lrate) if n_samples >= next_val: next_val += ratio_val * len(samples_train) pgb_train.pause() pgb_val = ProgressBar(len(batches_val), 20, 'VALIDATION ') costs_val = [] for batch_val in batches_val: get_cost(f_cost, samples_val, batch_val, mat_val, costs_val, pgb_val) costs.append(numpy.mean(costs_val)) if best_params is None or costs[-1] <= numpy.min(costs): best_params = params_unzip(t_params) n_bad_costs = 0 else: n_bad_costs += 1 if n_bad_costs > patience: n_stops += 1 print('WARNING: early stop for %d time(s)!' 
% n_stops) params_zip(best_params, t_params) n_bad_costs = 0 if path_save and n_samples >= next_save: next_save = max(next_save + ratio_save * len(samples_train), next_val) pgb_train.pause() print('Saving model...') if best_params is not None: params = best_params else: params = params_unzip(t_params) numpy.savez(path_save, costs=costs, **params) numpy.savez('%s_%f' % (path_save, costs_train[-1]), costs=costs, **params_unzip(t_params)) time_end = time.time() print('Training finished') print('TIME: %9.3f sec EPOCHS: %4d SPEED: %9.3f sec/epoch' % (time_end - time_start, n_epochs, (time_end - time_start) / n_epochs)) if best_params is not None: params_zip(best_params, t_params) else: best_params = params_unzip(t_params) print('Saving final model...') if path_save: numpy.savez(path_save, costs=costs, **best_params) print('Done.')
def import_from_xml(self): print("\nUploading data to InfluxDB:") progress_bar = ProgressBar(self.settings.nb_rrd_files*3) # nb_files * (read + upload + validate) errors = [] def _upload_and_validate(measurement, tags, fields, packed_values): try: self.write_series(measurement, tags, fields, packed_values) except Exception as e: errors.append((Symbol.NOK_RED, "Error writing {0} to InfluxDB: {1}".format(measurement, e))) return finally: progress_bar.update(len(fields)-1) # 'time' column ignored try: self.validate_record(measurement, fields) except Exception as e: errors.append((Symbol.WARN_YELLOW, "Validation error in {0}: {1}".format(measurement, e))) finally: progress_bar.update(len(fields)-1) # 'time' column ignored try: assert self.client and self.valid except: raise Exception("Not connected to a InfluxDB server") else: print(" {0} Connection to database \"{1}\" OK".format(Symbol.OK_GREEN, self.settings.influxdb['database'])) if self.settings.influxdb['group_fields']: """ In "group_fields" mode, all fields of a same plugin (ex: system, user, nice, idle... of CPU usage) will be represented as columns of the same time series in InfluxDB. Schema will be: +----------------------+-------+----------+----------+-----------+ | time_series_name | col_0 | col_1 | col_2 | col_3 ... | +----------------------+-------+----------+----------+-----------+ | domain.host.plugin | time | metric_1 | metric_2 | metric_3 | | acadis.org.tesla.cpu | time | system | user | nice | | ... | | | | | +----------------------+-------+----------+----------+-----------+ """ for domain, host, plugin in self.settings.iter_plugins(): _plugin = self.settings.domains[domain].hosts[host].plugins[plugin] measurement = plugin tags = { "domain": domain, "host": host, "plugin": plugin } if _plugin.is_multigraph: tags["is_multigraph"] = True print(host, plugin) field_names = ['time'] values = defaultdict(list) values_with_time = [] for field in _plugin.fields: _field = _plugin.fields[field] if _field.rrd_exported: field_names.append(field) try: content = read_xml_file(_field.xml_filename) except Exception as e: errors.append((Symbol.WARN_YELLOW, "Could not read file for {0}: {1}".format(field, e))) else: [values[key].append(value) for key, value in content.items()] # keep track of influxdb storage info to allow 'fetch' _field.influxdb_measurement = measurement _field.influxdb_field = field _field.xml_imported = True # update progress bar [###### ] 42 % progress_bar.update() # join data with time as first column values_with_time.extend([[k]+v for k, v in values.items()]) _upload_and_validate(measurement, tags, field_names, values_with_time) else: # non grouping """ In "non grouped" mode, all fields of a same plugin will have a dedicated time series and the values will be written to a 'value' column Schema will be: +-----------------------------+-------+-------+ | time_series_name | col_0 | col_1 | +-----------------------------+-------+-------+ | domain.host.plugin.metric_1 | time | value | | domain.host.plugin.metric_2 | time | value | | acadis.org.tesla.cpu.system | time | value | | ... 
| | | +-----------------------------+-------+-------+ """ for domain, host, plugin, field in self.settings.iter_fields(): _field = self.settings.domains[domain].hosts[host].plugins[plugin].fields[field] if not _field.rrd_exported: continue measurement = field tags = { "domain": domain, "host": host, "plugin": plugin } field_names = ['time', 'value'] values = defaultdict(list) values_with_time = [] _field.influxdb_measurement = measurement _field.influxdb_field = 'value' content = read_xml_file(_field.xml_filename) [values[key].append(value) for key, value in content.items()] _field.xml_imported = True progress_bar.update() # join data with time as first column values_with_time.extend([[k]+v for k, v in values.items()]) _upload_and_validate(measurement, tags, field_names, values_with_time) for error in errors: print(" {} {}".format(error[0], error[1]))
def discover_from_rrd(settings, insert_missing=True, print_missing=False): """ Builds a Munin dashboard structure (domain/host/plugins) by listing the files in the RRD folder http://munin-monitoring.org/wiki/MuninFileNames: /var/lib/munin/SomeGroup/foo.example.com-cpu-irq-d.rrd --------- --------------- --- --- - | | | | `-- Data type (a = absolute, c = counter, d = derive, g = gauge) | | | `----- Field name / data source: 'irq' | | `--------- Plugin name: 'cpu' | `------------------- Node name: 'foo.example.com' `-------------------------------- Group name: 'SomeGroup' """ folder = settings.paths['munin'] print "Reading Munin RRD cache: ({0})".format(folder) not_inserted = defaultdict(dict) for domain in os.listdir(folder): if not os.path.isdir(os.path.join(folder, domain)): #domains are represented as folders continue if not insert_missing and not domain in settings.domains: #skip unknown domains (probably no longer wanted) continue files = os.listdir(os.path.join(folder, domain)) progress_bar = ProgressBar(len(files), title=domain) for filename in files: progress_bar.update() path = os.path.join(folder, domain, filename) if os.path.isdir(path) or not path.endswith(".rrd"): # not a RRD database continue parts = os.path.splitext(filename)[0].split('-') length = len(parts) if(length < 4): print "Error:", filename, parts, length continue host, plugin, field, datatype = parts[0], ".".join(parts[1:-2]), parts[-2], parts[-1] if not insert_missing and (not host in settings.domains[domain].hosts or not plugin in settings.domains[domain].hosts[host].plugins): if not host in not_inserted[domain]: not_inserted[domain][host] = set() not_inserted[domain][host].add(plugin) continue plugin_data = settings.domains[domain].hosts[host].plugins[plugin] try: assert os.path.exists(os.path.join(folder, domain, "{0}-{1}-{2}-{3}.rrd".format(host, plugin.replace(".", "-"), field, datatype[0]))) except AssertionError: print "{0} != {1}-{2}-{3}-{4}.rrd".format(filename, host, plugin, field, datatype[0]) plugin_data.fields[field].rrd_found = False else: plugin_data.fields[field].rrd_found = True plugin_data.fields[field].rrd_filename = os.path.join(settings.paths['munin'], domain, filename) plugin_data.fields[field].xml_filename = os.path.join(settings.paths['xml'], domain, filename.replace(".rrd", ".xml")) plugin_data.fields[field].settings = { "type": DATA_TYPES[datatype] } settings.nb_fields += 1 if print_missing and len(not_inserted): print "The following plugin databases were ignored" for domain, hosts in not_inserted.items(): print " - Domain {0}:".format(domain) for host, plugins in hosts.items(): print " {0} Host {1}: {2}".format(Symbol.NOK_RED, host, ", ".join(plugins)) return settings
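# The RRD filename convention documented in discover_from_rrd can be replayed in isolation;
# this just applies the snippet's own split logic to the example name from its docstring.
import os

filename = 'foo.example.com-cpu-irq-d.rrd'
parts = os.path.splitext(filename)[0].split('-')
host, plugin, field, datatype = parts[0], '.'.join(parts[1:-2]), parts[-2], parts[-1]
print(host, plugin, field, datatype)   # -> foo.example.com cpu irq d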