def wrapper(*args, **kwargs):
    global conn
    cur = None
    if 'cursor' not in kwargs:
        cur = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
        kwargs['cursor'] = cur
    log = loggers.Logger('QueryErrors')
    while True:
        try:
            query = foo(*args, **kwargs)
            if cur is not None:
                kwargs['cursor'].close()
            break
        except (psycopg2.OperationalError, psycopg2.InterfaceError, psycopg2.DatabaseError) as e1:
            try:
                log.error(f"Error during query {e1}", exc_info=True)
                kwargs['cursor'].close()
                conn.close()
                conn = db_connect()
                kwargs['cursor'] = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
            except Exception as e2:
                log.error(f"Error during db_connect: {e2}", exc_info=True)
                time.sleep(60)  # Retry delay; it is unclear what the right value should be
        except Exception as e3:
            log.error(f'Unhandled exception in wrapper {e3}', exc_info=True)
            raise e3
    return query
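# A minimal sketch (not from the original source) of the decorator this wrapper
# presumably sits inside: `foo` is the wrapped query function and `wrapper` replaces it.
# The decorator name `retry_query` and the decorated example below are hypothetical.
def retry_query(foo):
    def wrapper(*args, **kwargs):
        ...  # body as above
    return wrapper

@retry_query
def fetch_task(tid, cursor=None):
    cursor.execute('SELECT * FROM tasks WHERE id = %s', (tid,))
    return cursor.fetchone()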
def __init__(self, model, optimizer, sampling_min, batch_size, lr_sched, num_classes):
    ## Hardcoded params
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    sb_start_epoch = 1
    log_interval = 1
    sampling_max = 1

    # Params for resuming from checkpoint
    start_epoch = 0
    start_num_backpropped = 0
    start_num_skipped = 0

    probability_calculator = selectors.SelectProbabiltyCalculator(sampling_min,
                                                                  sampling_max,
                                                                  num_classes,
                                                                  device,
                                                                  square=False,
                                                                  translate=False)
    final_selector = selectors.DeterministicSamplingSelector(probability_calculator,
                                                             initial_sum=1)
    final_backpropper = backproppers.SamplingBackpropper(device, model, optimizer)

    self.selector = selectors.PrimedSelector(selectors.BaselineSelector(),
                                             final_selector,
                                             sb_start_epoch,
                                             epoch=start_epoch)
    self.backpropper = backproppers.PrimedBackpropper(
        backproppers.BaselineBackpropper(device, model, optimizer),
        final_backpropper,
        sb_start_epoch,
        epoch=start_epoch)
    self.trainer = trainer.Trainer(device,
                                   model,
                                   self.selector,
                                   self.backpropper,
                                   batch_size,
                                   lr_schedule=lr_sched)

    self.logger = loggers.Logger(log_interval=log_interval,
                                 epoch=start_epoch,
                                 num_backpropped=start_num_backpropped,
                                 num_skipped=start_num_skipped)
    self.trainer.on_forward_pass(self.logger.handle_forward_batch)
    self.trainer.on_backward_pass(self.logger.handle_backward_batch)
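# Hypothetical usage of the wrapper class this __init__ belongs to. The class name
# `SBTrainer`, the `trainloader`, and the `trainer.train(...)` entry point are
# assumptions for illustration; the real driver script is not shown in this snippet.
model = torchvision.models.resnet18(num_classes=10).to('cuda')
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
sb = SBTrainer(model, optimizer, sampling_min=0.1, batch_size=128,
               lr_sched=None, num_classes=10)
for epoch in range(200):
    sb.trainer.train(trainloader)  # assumed Trainer entry point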
def __init__(self, model, optimizer, prob_pow, batch_size, lr_sched, num_classes,
             num_training_images, forwardlr, strategy, calculator="relative",
             fp_selector_type="alwayson", staleness=2):
    ## Hardcoded params
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    assert device == "cuda"
    self.num_training_images = num_training_images
    num_images_to_prime = self.num_training_images
    #num_images_to_prime = 0
    log_interval = 1
    sampling_min = 0
    sampling_max = 1
    max_history_len = 1024
    prob_loss_fn = nn.CrossEntropyLoss
    loss_fn = nn.CrossEntropyLoss
    sample_size = 0  # only needed for topk, lowk

    # Params for resuming from checkpoint
    start_epoch = 0
    start_num_backpropped = 0
    start_num_skipped = 0

    self.selector = None
    self.fp_selector = None

    if strategy == "nofilter":
        self.backpropper = backproppers.SamplingBackpropper(device, model, optimizer, loss_fn)
        self.trainer = trainer.NoFilterTrainer(device,
                                               model,
                                               self.backpropper,
                                               batch_size,
                                               loss_fn,
                                               lr_schedule=lr_sched,
                                               forwardlr=forwardlr)
    else:
        probability_calculator = calculators.get_probability_calculator(calculator,
                                                                        device,
                                                                        prob_loss_fn,
                                                                        sampling_min,
                                                                        sampling_max,
                                                                        num_classes,
                                                                        max_history_len,
                                                                        prob_pow)
        self.selector = selectors.get_selector("sampling",
                                               probability_calculator,
                                               num_images_to_prime,
                                               sample_size)
        self.fp_selector = fp_selectors.get_selector(fp_selector_type,
                                                     num_images_to_prime,
                                                     staleness=staleness)
        self.backpropper = backproppers.SamplingBackpropper(device, model, optimizer, loss_fn)
        self.trainer = trainer.MemoizedTrainer(device,
                                               model,
                                               self.selector,
                                               self.fp_selector,
                                               self.backpropper,
                                               batch_size,
                                               loss_fn,
                                               lr_schedule=lr_sched,
                                               forwardlr=forwardlr)

    self.logger = loggers.Logger(log_interval=log_interval,
                                 epoch=start_epoch,
                                 num_backpropped=start_num_backpropped,
                                 num_skipped=start_num_skipped,
                                 start_time_seconds=start_time_seconds)
    self.trainer.on_backward_pass(self.logger.handle_backward_batch)
    self.trainer.on_forward_pass(self.logger.handle_forward_batch)
def get_address(tid, _OS, ssh=None, cursor=None):
    try_exc(os.mkdir, f"{windows.projects}{tid}")
    logger = loggers.Logger(tid, address=H_OS.projects + str(tid) + H_OS.slash)

    cursor.execute(f'SELECT project, file, params, uid FROM tasks WHERE id = {tid}')
    row = foolproof(cursor.fetchone())
    uid = row.uid
    param = json.loads(row.params)
    path = _OS.r_path + 'files' + _OS.slash + str(row.project) + _OS.slash  # prefix\tasks.project\

    cursor.execute(f'SELECT fname, parent, pid, version FROM files WHERE id = {row.file}')  # tasks.file
    row = foolproof(cursor.fetchone())
    pid = row.pid
    if ssh:
        try_exc(ssh.run, f'{_OS.mkdir} "{_OS.r_path}files{_OS.slash}{pid}"')

    if row.version:  # row.version is not None in blend project only
        if row.version not in BLD_V:
            logger.critical(f'Unknown or unsupported blender file version: {row.version}')
            param['v'] = BLD_V['282']
        else:
            param['v'] = BLD_V[row.version]
        cursor.execute(f"UPDATE tasks SET params='{json.dumps(param)}'::json WHERE id={tid}")
        conn.commit()

    temp_path = row.fname  # contains .skp filename
    # print(temp_path)
    while row.parent:
        cursor.execute(f'SELECT fname, parent FROM files WHERE id = {row.parent}')
        row = cursor.fetchone()
        temp_path = row.fname + windows.slash + temp_path  # folder\filename.skp
    projectfile = path + temp_path

    cursor.execute(f'SELECT * FROM files WHERE pid={pid} and host_location IS NOT NULL and size !=0 and status = 1')
    files = cursor.fetchall()
    for file in files:
        time_out = 600
        if list(filter(file.host_location.endswith, ['.blend', '.skp', '.max', '.vrscene'])) \
                and _OS.r_path + file.host_location.replace(_OS.antislash, _OS.slash) != path + temp_path:
            continue
        logger.debug(f'File row: {file}')
        logger.debug(windows.r_path + file.host_location.replace('/', windows.slash))

        while time_out > 0:
            if os.access((windows.r_path + file.host_location).replace('/', windows.slash), os.F_OK):
                break
            time_out -= 3
            time.sleep(3)
            logger.debug(f'No {windows.r_path}{file.host_location} file on disk. Waiting'.replace('/', windows.slash))
        else:
            # Should raise an exception on the first missing file
            logger.critical(f'No {windows.r_path}{file.host_location} file on disk'.replace('/', windows.slash))
            temp_frames_ban(tid, cursor=cursor)
            frame_splits_set_ban(tid, cursor=cursor)
            status_update(tid, tid, H_OS, status=15, cursor=cursor)
            raise TimeoutError
        logger.debug(f"Its size:{os.path.getsize((windows.r_path + file.host_location).replace('/', windows.slash))}")

        time_out = 300
        while time_out > 0:
            if os.path.getsize((windows.r_path + file.host_location).replace('/', windows.slash)) == file.size:
                # send
                if ssh:
                    dir_adr = (_OS.r_path + file.host_location).replace(_OS.antislash, _OS.slash).rsplit(_OS.slash, 1)[0]
                    if (path + temp_path).replace(_OS.antislash, _OS.slash).rsplit(_OS.slash, 1)[0] != dir_adr:
                        try_exc(ssh.run, _OS.mkdir, dir_adr)
                        logger.debug(f'{_OS.mkdir} "{dir_adr}"')
                    remdir = (_OS.put_path + file.host_location).replace(_OS.antislash, _OS.slash)
                    if file.host_location.endswith('.max'):
                        remdir = remdir.rsplit(windows.slash, 1)[0] + windows.slash + str(file.id) + '.max'
                        projectfile = projectfile.rsplit(windows.slash, 1)[0] + windows.slash + str(file.id) + '.max'
                    ssh.put((windows.r_path + file.host_location).replace(windows.antislash, windows.slash), remote=remdir)
                break
            logger.debug(f'Size of {windows.r_path}{file.host_location} has not matched. Waiting'.replace('/', windows.slash))
            time_out -= 1
            time.sleep(2)
        else:
            logger.critical(f'Size of {windows.r_path}{file.host_location} has not matched.'.replace('/', windows.slash))
            temp_frames_ban(tid)
            status_update(tid, tid, H_OS, status=15)
            raise TimeoutError

    logger.debug('All files are OK')
    return projectfile, param, uid
def server_check(task, cursor=None, servers='3,5,12,16,17,18'):
    logger = loggers.Logger(task.task, address=H_OS.projects + str(task.task) + H_OS.slash)
    cursor.execute(f'SELECT server_level FROM temp_frames where tid={task.task} and server_level=128')
    is_srv_128 = cursor.fetchone()  # Checks whether the scene needs 128 nodes
    while True:
        """Poll the servers and fill the dictionaries"""
        cursor.execute(f'SELECT busy, ip, level FROM servers WHERE id IN ({servers}) ORDER BY id')
        row = cursor.fetchall()
        for srv in SRV:
            SRV[srv][1] = sum([row[SRV_LIST.index(i)].busy for i in SRV[srv][0]])
        for srv in win_dict:
            win_dict[srv][1] = row[SRV_LIST.index(win_dict[srv][0])].busy
        for srv in lin_dict:
            lin_dict[srv][1] = row[SRV_LIST.index(lin_dict[srv][0])].busy

        """Windows Blender and 3ds Max are handled the same way"""
        if task.render == 20 or task.render == 1:
            for srv in win_dict:
                if win_dict[srv][1] == 0:
                    update_server_status(SRV_LIST[srv], logger, status=1, cursor=cursor)
                    frame_splits_update(task.id, is_done=-1, cursor=cursor)
                    return srv, task.id, task.render, '64,128'
            return None, None, None, '64,128'
        elif task.render == 21:
            """Linux Blender"""
            for srv in lin_dict:
                if lin_dict[srv][1] == 0:
                    update_server_status(SRV_LIST[srv], logger, status=1, cursor=cursor)
                    frame_splits_update(task.id, is_done=-1, cursor=cursor)
                    return srv, task.id, task.render, '64,128'
            return None, None, None, '64,128'
        elif task.render == 4 or task.render == 5:
            """vrscene files from SU and solo"""
            if is_srv_128 and not SRV[4][1]:
                # First check for 128 nodes and take SRV[4] as the only option
                update_server_status(SRV[4][0], logger, status=1, cursor=cursor)
                frame_splits_update(task.id, is_done=-1, tariff=1.7, cursor=cursor)
                return 4, task.id, task.render, '64,128'
            elif is_srv_128:
                # If the servers are busy, start a new search for a 64-node scene
                return None, None, None, '64'
            else:
                for srv in SRV:
                    if SRV[srv][1] == 0:
                        # 64-node scenes manage to grab servers while we switch between 128-node scenes,
                        # so check the queue for a pending 128-node scene before taking the server.
                        # We could check the node count of the top scene, but it is more practical to
                        # compare tid values - they will differ if a scene is jumping the queue.
                        task2 = scene_check(cursor=cursor)
                        tariff = None  # tariff defaults to 1; to keep it that way, pass None to frame_splits_update
                        if task.task != task2.task and not SRV[4][1]:
                            srv = 4
                            tariff = 1.7
                            task = task2
                        update_server_status(SRV[srv][0], logger, status=1, cursor=cursor)
                        frame_splits_update(task.id, is_done=-1, tariff=tariff, cursor=cursor)
                        return srv, task.id, task.render, '64,128'
                else:
                    return None, None, None, '64,128'
def status_update(tid, name, _os, encoding='utf-8', status=0, cursor=None):
    logger = loggers.Logger(name, encoding=encoding, address=_os.projects + str(tid) + _os.slash)
    cursor.execute(f'UPDATE tasks SET status={status} WHERE id={tid}')
    cursor.execute(f'select name from statuses where id={status}')
    logger.debug(f'Status updated for tid:{tid}, set: {cursor.fetchone().name}')
    conn.commit()
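# A hedged alternative to the statements above, using psycopg2 query parameters instead
# of f-string interpolation (behaviour is the same for the integer ids used here; the
# function name `status_update_param` is made up for this sketch):
def status_update_param(tid, name, _os, encoding='utf-8', status=0, cursor=None):
    logger = loggers.Logger(name, encoding=encoding, address=_os.projects + str(tid) + _os.slash)
    cursor.execute('UPDATE tasks SET status=%s WHERE id=%s', (status, tid))
    cursor.execute('SELECT name FROM statuses WHERE id=%s', (status,))
    logger.debug(f'Status updated for tid:{tid}, set: {cursor.fetchone().name}')
    conn.commit()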
def experiment(n_classes, bilinear, dropout, learning_rate, gamma, alpha, data_path,
               val_percent, batch_size, rep_num, logfile, augment, date):
    settings = locals().copy()

    # Make results folder for specific parameters
    savepath = 'results/{}/g{}_alp{}_bs{}_lr{}_bilin{}_do{}_aug{}'.format(
        date, gamma, alpha, batch_size, learning_rate, bilinear, dropout, augment)
    if not os.path.exists(savepath):
        os.makedirs(savepath)

    if logfile != 'none':
        try:
            logger = loggers.Logger('{}/{}'.format(savepath, logfile))
        except loggers.LogAlreadyExistsError as e:
            print(e.message)
            return
        logger.connect()

    print('Program: {}'.format(sys.argv[0]))
    print('Command line: {}'.format(' '.join(sys.argv)))
    print('Settings:')
    print(', '.join(['{}={}'.format(k, settings[k]) for k in sorted(settings.keys())]))

    best_accuracy = float('inf')  # tracks the best (lowest) validation loss

    # LOAD MODEL
    input_channels = 5
    print('LOAD VGG16 UNET MODEL')
    torch_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = vgg16_UNet(models.vgg16(), input_channels, n_classes, bilinear, dropout).to(torch_device)
    model.encoder.expand_input(input_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = FocalLoss(gamma=gamma, alpha=alpha)

    print('Selecting dataset from training islands...')
    trainset_X, valset_X, trainset_y, valset_y = new_data_splitter(
        data_path, test_island, island_refs, observer_list, val_percent)
    train_X = ImageAccessor(trainset_X, load_image_vgg16)
    val_X = ImageAccessor(valset_X, load_image_vgg16)
    train_y = ImageAccessor(trainset_y, load_tgt_vgg16)
    val_y = ImageAccessor(valset_y, load_tgt_vgg16)
    trainset = data_source.ArrayDataSource([train_X, train_y])
    valset = data_source.ArrayDataSource([val_X, val_y])

    # Augment
    trainset = trainset.map(augment_batch)
    valset = valset.map(augment_batch)
    pool = work_pool.WorkerThreadPool(4)
    trainset = pool.parallel_data_source(trainset)
    valset = pool.parallel_data_source(valset)
    data_loaders = {'train': trainset, 'val': valset}

    print('BEGIN TRAINING...')
    total_train_loss = []
    total_val_loss = []
    no_improvement = 0
    epoch = 0
    while no_improvement < 10:
        t1 = time.time()
        print('-' * 10)

        # Train and validate
        train_loss = train(model, optimizer, data_loaders, criterion, batch_size, torch_device, 'train')
        val_loss = train(model, optimizer, data_loaders, criterion, batch_size, torch_device, 'val')
        t2 = time.time()
        print('Epoch {} took {:.3f}s; training loss = {:.6f}; validation loss = {:.6f}'
              .format(epoch, t2 - t1, train_loss, val_loss))

        # Save losses
        total_train_loss.append(train_loss)
        total_val_loss.append(val_loss)

        # Check loss and save checkpoint
        is_best = bool(val_loss < best_accuracy)
        if is_best:
            no_improvement = 0  # reset the counter after new best found
        else:
            no_improvement += 1  # count how many non-improvements
        print('Current best loss: {}'.format(best_accuracy))
        best_accuracy = min(val_loss, best_accuracy)
        print('{}/checkpoint.pth.tar'.format(savepath))
        save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_accuracy': torch.FloatTensor([best_accuracy])
            }, is_best, '{}/checkpoint.pth.tar'.format(savepath))
        print('No improvement in loss for {} epochs'.format(no_improvement))

        if epoch == 5:
            print('Setting new learning rate...')
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate / 10.)
        epoch += 1

    np.savez('{}/TrainingLoss.npz'.format(savepath),
             train_loss=total_train_loss,
             val_loss=total_val_loss)
    return
folder, __, files = next(g)
fantasy_zip = zipfile.ZipFile(f"{H_OS.projects}{_tid}{H_OS.slash}archive.zip", 'w')
not_empty = False
for file in files:
    if file.endswith((".png", ".PNG", ".jpg", ".JPG", ".exr", ".EXR")):  # TODO: support output format selection
        not_empty = True
        fantasy_zip.write(os.path.join(folder, file), file, compress_type=zipfile.ZIP_DEFLATED)
fantasy_zip.close()

if not_empty:
    crl = pycurl.Curl()
    crl.setopt(crl.URL, f'https://winrender.com/upload_blender.php?id={_tid}')
    crl.setopt(pycurl.SSL_VERIFYPEER, 0)
    crl.setopt(pycurl.SSL_VERIFYHOST, 0)
    crl.setopt(crl.HTTPPOST, [
        ('file', (
            crl.FORM_FILE,
            f"{H_OS.projects}{_tid}{H_OS.slash}archive.zip",
        )),
    ])
    result = crl.perform_rs()
    logger.info(f'{H_OS.projects}{_tid}{H_OS.slash}archive.zip sent:{result}')
    crl.close()
else:
    os.remove(f"{H_OS.projects}{_tid}{H_OS.slash}archive.zip")


if __name__ == '__main__':
    logger = loggers.Logger(sys.argv[1], address=H_OS.projects + sys.argv[1] + H_OS.slash)
    uploader(sys.argv[1])
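# A hedged alternative to the pycurl upload above using the `requests` library (not part
# of the original source; verify=False mirrors the disabled SSL checks, and the URL and
# archive path are taken from the snippet above):
import requests

def upload_archive(_tid):
    archive = f"{H_OS.projects}{_tid}{H_OS.slash}archive.zip"
    with open(archive, 'rb') as fh:
        resp = requests.post(f'https://winrender.com/upload_blender.php?id={_tid}',
                             files={'file': fh}, verify=False)
    return resp.text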
def __init__(self, model, optimizer, prob_pow, batch_size, lr_sched, num_classes,
             num_training_images, forwardlr, strategy, kath_oversampling_rate,
             calculator="relative", fp_selector_type="alwayson", staleness=2,
             spline_y1=None, spline_y2=None, spline_y3=None):
    ## Hardcoded params
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    assert device == "cuda"
    self.num_training_images = num_training_images
    num_images_to_prime = self.num_training_images
    log_interval = 1
    bias_batch_log_interval = 1000
    sampling_min = 0
    sampling_max = 1
    max_history_len = 1024
    prob_loss_fn = nn.CrossEntropyLoss
    loss_fn = nn.CrossEntropyLoss
    sample_size = 0  # only needed for kath, topk, lowk

    # Params for resuming from checkpoint
    start_epoch = 0
    start_num_backpropped = 0
    start_num_skipped = 0
    kath_oversampling_rate = 4  # note: overrides the constructor argument of the same name

    self.selector = None
    self.fp_selector = None
    self.bias_logger = None

    if strategy == "kath":
        self.selector = None
        self.backpropper = backproppers.SamplingBackpropper(device, model, optimizer, loss_fn)
        self.trainer = trainer.KathTrainer(device,
                                           model,
                                           self.backpropper,
                                           batch_size,
                                           int(batch_size * kath_oversampling_rate),
                                           loss_fn,
                                           lr_schedule=lr_sched,
                                           forwardlr=forwardlr)
    elif strategy == "nofilter":
        self.backpropper = backproppers.SamplingBackpropper(device, model, optimizer, loss_fn)
        self.trainer = trainer.NoFilterTrainer(device,
                                               model,
                                               self.backpropper,
                                               batch_size,
                                               loss_fn,
                                               lr_schedule=lr_sched,
                                               forwardlr=forwardlr)
    elif strategy == "logbias":
        probability_calculator = calculators.get_probability_calculator(
            calculator, device, prob_loss_fn, sampling_min, sampling_max, num_classes,
            max_history_len, prob_pow, spline_y1, spline_y2, spline_y3)
        self.selector = selectors.get_selector("sampling",
                                               probability_calculator,
                                               num_images_to_prime,
                                               sample_size)
        self.fp_selector = fp_selectors.get_selector("alwayson",
                                                     num_images_to_prime,
                                                     staleness=staleness)
        self.backpropper = backproppers.GradientAndSelectivityLoggingBackpropper(
            device, model, optimizer, loss_fn, 10, bias_batch_log_interval)
        self.trainer = trainer.MemoizedTrainer(device,
                                               model,
                                               self.selector,
                                               self.fp_selector,
                                               self.backpropper,
                                               batch_size,
                                               loss_fn,
                                               lr_schedule=lr_sched,
                                               forwardlr=forwardlr)
        self.bias_logger = loggers.BiasByEpochLogger("/tmp", "test", bias_batch_log_interval)
        self.trainer.on_backward_pass(self.bias_logger.handle_backward_batch)
    else:
        probability_calculator = calculators.get_probability_calculator(
            calculator, device, prob_loss_fn, sampling_min, sampling_max, num_classes,
            max_history_len, prob_pow, spline_y1, spline_y2, spline_y3)
        self.selector = selectors.get_selector("sampling",
                                               probability_calculator,
                                               num_images_to_prime,
                                               sample_size)
        self.fp_selector = fp_selectors.get_selector(fp_selector_type,
                                                     num_images_to_prime,
                                                     staleness=staleness)
        self.backpropper = backproppers.SamplingBackpropper(device, model, optimizer, loss_fn)
        self.trainer = trainer.MemoizedTrainer(device,
                                               model,
                                               self.selector,
                                               self.fp_selector,
                                               self.backpropper,
                                               batch_size,
                                               loss_fn,
                                               lr_schedule=lr_sched,
                                               forwardlr=forwardlr)

    self.logger = loggers.Logger(log_interval=log_interval,
                                 epoch=start_epoch,
                                 num_backpropped=start_num_backpropped,
                                 num_skipped=start_num_skipped,
                                 start_time_seconds=start_time_seconds)
    self.trainer.on_backward_pass(self.logger.handle_backward_batch)
    self.trainer.on_forward_pass(self.logger.handle_forward_batch)
from dbmod import *
from su2019 import su_func
import loggers

if __name__ == '__main__':
    subprocess.Popen(["python", f"{PROJECT_DIR}SplitManager.py"])
    while True:
        task_row = None
        while not task_row:
            task_row = task_monitor()
            time.sleep(4)

        """Got task"""
        try_exc(os.mkdir, f"{windows.projects}{task_row.tid}")
        logger = loggers.Logger(task_row.tid, address=H_OS.projects + str(task_row.tid) + H_OS.slash)
        logger.info(f"Got task:\n{task_row}")

        """Unsupported renderer"""
        if task_row.render not in (20, 21, 4, 5, 1):
            temp_frames_ban(task_row.tid)
            status_update(task_row.tid, task_row.tid, H_OS, status=13)
            logger.warning(
                f"Task {task_row.tid} gets temp_frames.is_done=2 because its render={task_row.render}")
            continue

        """Supported renderer"""
        status_update(task_row.tid, task_row.tid, H_OS, status=2)
        temp_frames_ban(task_row.tid, is_done=-1)

        """Blender"""
        if task_row.render == 20 or task_row.render == 21: