def main(): db = vmdatabase.VMDatabase() machine = db.getMachineByJenkinsName(NODE_NAME) if machine.state != vmdatabase.HOLD: machine.state = vmdatabase.DELETE try: utils.update_stats(machine.base_image.provider) if UPSTREAM_BUILD_URL: fd = urllib.urlopen(UPSTREAM_BUILD_URL+'api/json') data = json.load(fd) result = data['result'] if statsd and result == 'SUCCESS': dt = int(data['duration']) key = 'devstack.job.%s' % UPSTREAM_JOB_NAME statsd.timing(key+'.runtime', dt) statsd.incr(key+'.builds') key += '.%s' % UPSTREAM_BRANCH statsd.timing(key+'.runtime', dt) statsd.incr(key+'.builds') key += '.%s' % machine.base_image.provider.name statsd.timing(key+'.runtime', dt) statsd.incr(key+'.builds') except: print "Error getting build information" traceback.print_exc()
def main(): db = vmdatabase.VMDatabase() if not SKIP_DEVSTACK_GATE_JENKINS: config = ConfigParser.ConfigParser() config.read(DEVSTACK_GATE_SECURE_CONFIG) jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'), config.get('jenkins', 'user'), config.get('jenkins', 'apikey')) jenkins.get_info() else: jenkins = None provider = db.getProvider(PROVIDER_NAME) print "Working with provider %s" % provider.name error = False for machine in provider.machines: if machine.state != vmdatabase.READY: continue print 'Checking machine', machine.name try: check_machine(jenkins, machine) except: error = True traceback.print_exc() utils.update_stats(provider) if error: sys.exit(1)
def main():
    """Flag this Jenkins node's VM for deletion unless it is on hold,
    then refresh the provider stats."""
    database = vmdatabase.VMDatabase()
    node = database.getMachineByJenkinsName(NODE_NAME)
    on_hold = (node.state == vmdatabase.HOLD)
    if not on_hold:
        node.state = vmdatabase.DELETE
    utils.update_stats(node.base_image.provider)
def main():
    """Mark this Jenkins node's VM as USED and relabel the Jenkins node
    to 'devstack-used' so it is not scheduled again."""
    db = vmdatabase.VMDatabase()
    config = ConfigParser.ConfigParser()
    config.read(DEVSTACK_GATE_SECURE_CONFIG)
    if not SKIP_DEVSTACK_GATE_JENKINS:
        jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'),
                                    config.get('jenkins', 'user'),
                                    config.get('jenkins', 'apikey'))
        jenkins.get_info()
    else:
        jenkins = None
    try:
        machine = db.getMachineByJenkinsName(NODE_NAME)
    except Exception:
        utils.log.debug("Unable to find node: %s" % NODE_NAME)
        return
    utils.log.debug("Used ID: %s old state: %s build:%s" % (
        machine.id, machine.state, BUILD_URL))
    machine.state = vmdatabase.USED
    if jenkins and machine.jenkins_name:
        if jenkins.node_exists(machine.jenkins_name):
            # Use a distinct name for the node's XML so the secure-config
            # parser above is not shadowed.
            node_config = jenkins.get_node_config(machine.jenkins_name)
            old = None
            m = LABEL_RE.search(node_config)
            if m:
                old = m.group(1)
            node_config = LABEL_RE.sub('<label>devstack-used</label>',
                                       node_config)
            # Jenkins reconfig can be flaky: retry up to 3 times, and
            # stop after the first success (the original re-ran the
            # reconfig on every iteration).
            for i in range(3):
                try:
                    jenkins.reconfig_node(machine.jenkins_name, node_config)
                    break
                except Exception:
                    if i == 2:
                        utils.log.exception(
                            "Unable to relabel ID: %s" % machine.id)
                        raise
                    time.sleep(5)
            utils.log.debug(
                "Relabeled ID: %s old label: %s new label: %s" % (
                    machine.id, old, 'devstack-used'))
    utils.update_stats(machine.base_image.provider)
def main():
    """Transition this node's VM to USED and swap its Jenkins label to
    'devstack-used' so the scheduler skips it."""
    db = vmdatabase.VMDatabase()
    config = ConfigParser.ConfigParser()
    config.read(DEVSTACK_GATE_SECURE_CONFIG)
    if not SKIP_DEVSTACK_GATE_JENKINS:
        jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'),
                                    config.get('jenkins', 'user'),
                                    config.get('jenkins', 'apikey'))
        jenkins.get_info()
    else:
        jenkins = None
    try:
        machine = db.getMachineByJenkinsName(NODE_NAME)
    except Exception:
        utils.log.debug("Unable to find node: %s" % NODE_NAME)
        return
    utils.log.debug("Used ID: %s old state: %s build:%s" %
                    (machine.id, machine.state, BUILD_URL))
    machine.state = vmdatabase.USED
    if jenkins and machine.jenkins_name:
        if jenkins.node_exists(machine.jenkins_name):
            # Keep the node XML under its own name: reusing ``config``
            # would clobber the secure-config parser above.
            node_config = jenkins.get_node_config(machine.jenkins_name)
            old = None
            m = LABEL_RE.search(node_config)
            if m:
                old = m.group(1)
            node_config = LABEL_RE.sub('<label>devstack-used</label>',
                                       node_config)
            # Retry the flaky Jenkins call, but break once it succeeds
            # instead of reconfiguring three times as before.
            for i in range(3):
                try:
                    jenkins.reconfig_node(machine.jenkins_name, node_config)
                    break
                except Exception:
                    if i == 2:
                        utils.log.exception("Unable to relabel ID: %s" %
                                            machine.id)
                        raise
                    time.sleep(5)
            utils.log.debug(
                "Relabeled ID: %s old label: %s new label: %s" %
                (machine.id, old, 'devstack-used'))
    utils.update_stats(machine.base_image.provider)
def main(): db = vmdatabase.VMDatabase() try: machine = db.getMachineByJenkinsName(NODE_NAME) except Exception: utils.log.debug("Unable to find node: %s" % NODE_NAME) return if machine.state != vmdatabase.HOLD: utils.log.debug("Set deleted ID: %s old state: %s build: %s" % ( machine.id, machine.state, BUILD_URL)) machine.state = vmdatabase.DELETE else: utils.log.debug("Hold ID: %s old state: %s build: %s" % ( machine.id, machine.state, BUILD_URL)) try: utils.update_stats(machine.base_image.provider) if UPSTREAM_BUILD_URL: fd = urllib.urlopen(UPSTREAM_BUILD_URL + 'api/json') data = json.load(fd) result = data['result'] if statsd and result == 'SUCCESS': dt = int(data['duration']) key = 'devstack.job.%s' % UPSTREAM_JOB_NAME statsd.timing(key + '.runtime', dt) statsd.incr(key + '.builds') key += '.%s' % UPSTREAM_BRANCH statsd.timing(key + '.runtime', dt) statsd.incr(key + '.builds') key += '.%s' % machine.base_image.provider.name statsd.timing(key + '.runtime', dt) statsd.incr(key + '.builds') except: print "Error getting build information" traceback.print_exc()
def main():
    """Mark this node's VM as in use and relabel the Jenkins node to
    'devstack-used'."""
    database = vmdatabase.VMDatabase()
    secure = ConfigParser.ConfigParser()
    secure.read(DEVSTACK_GATE_SECURE_CONFIG)
    jenkins = myjenkins.Jenkins(secure.get('jenkins', 'server'),
                                secure.get('jenkins', 'user'),
                                secure.get('jenkins', 'apikey'))
    jenkins.get_info()
    node = database.getMachineByJenkinsName(NODE_NAME)
    node.state = vmdatabase.USED
    if node.jenkins_name and jenkins.node_exists(node.jenkins_name):
        node_xml = jenkins.get_node_config(node.jenkins_name)
        node_xml = LABEL_RE.sub('<label>devstack-used</label>', node_xml)
        jenkins.reconfig_node(node.jenkins_name, node_xml)
    utils.update_stats(node.base_image.provider)
def main(): db = vmdatabase.VMDatabase() if not SKIP_DEVSTACK_GATE_JENKINS: config = ConfigParser.ConfigParser() config.read(DEVSTACK_GATE_SECURE_CONFIG) jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'), config.get('jenkins', 'user'), config.get('jenkins', 'apikey')) jenkins.get_info() else: jenkins = None print 'Known machines (start):' db.print_state() provider = db.getProvider(PROVIDER_NAME) print "Working with provider %s" % provider.name client = utils.get_client(provider) flavor = utils.get_flavor(client, 1024) print "Found flavor", flavor error = False now = time.time() for machine in provider.machines: # Normally, reap machines that have sat in their current state # for 24 hours, unless that state is READY. if (REAP_ALL_SERVERS or (machine.state != vmdatabase.READY and now - machine.state_time > MACHINE_LIFETIME) or machine.state == vmdatabase.DELETE or machine.state == vmdatabase.ERROR): print 'Deleting machine', machine.name try: delete_machine(jenkins, client, machine) except: error = True traceback.print_exc() provider_min_ready = 0 for base_image in provider.base_images: provider_min_ready += base_image.min_ready for snap_image in base_image.snapshot_images: # Normally, reap images that have sat in their current state # for 24 hours, unless the image is the current snapshot if (REAP_ALL_IMAGES or (snap_image != base_image.current_snapshot and now - snap_image.state_time > MACHINE_LIFETIME)): print 'Deleting image', snap_image.name try: delete_image(client, snap_image) except: error = True traceback.print_exc() print print 'Known machines (end):' db.print_state() utils.update_stats(provider) if error: sys.exit(1)
def main(): db = vmdatabase.VMDatabase() if not SKIP_DEVSTACK_GATE_JENKINS: config = ConfigParser.ConfigParser() config.read(DEVSTACK_GATE_SECURE_CONFIG) jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'), config.get('jenkins', 'user'), config.get('jenkins', 'apikey')) jenkins.get_info() else: jenkins = None provider = db.getProvider(PROVIDER_NAME) print "Working with provider %s" % provider.name client = utils.get_client(provider) last_name = '' error_counts = {} error = False for base_image in provider.base_images: snap_image = base_image.current_snapshot if not snap_image: continue print "Working on image %s" % snap_image.name flavor = utils.get_flavor(client, base_image.min_ram) print "Found flavor", flavor remote_snap_image = client.images.get(snap_image.external_id) print "Found image", remote_snap_image num_to_launch = calculate_deficit(provider, base_image) for i in range(num_to_launch): try: server, machine = launch_node(client, snap_image, remote_snap_image, flavor, last_name) last_name = machine.name except: traceback.print_exc() error = True while True: utils.update_stats(provider) building_machines = provider.building_machines if not building_machines: print "No more machines are building, finished." break print "Waiting on %s machines" % len(building_machines) for machine in building_machines: try: check_machine(jenkins, client, machine, error_counts) except: traceback.print_exc() print "Abandoning machine %s" % machine.id machine.state = vmdatabase.ERROR error = True db.commit() time.sleep(3) if error: sys.exit(1)
def main(): db = vmdatabase.VMDatabase() jenkins = None credentials_id = None if not SKIP_DEVSTACK_GATE_JENKINS: config = ConfigParser.ConfigParser() config.read(DEVSTACK_GATE_SECURE_CONFIG) jenkins = myjenkins.Jenkins(config.get('jenkins', 'server'), config.get('jenkins', 'user'), config.get('jenkins', 'apikey')) jenkins.get_info() if config.has_option('jenkins', 'credentials_id'): credentials_id = config.get('jenkins', 'credentials_id') provider = db.getProvider(PROVIDER_NAME) print "Working with provider %s" % provider.name client = utils.get_client(provider) last_name = '' error_counts = {} error = False for base_image in provider.base_images: snap_image = base_image.current_snapshot if not snap_image: continue print "Working on image %s" % snap_image.name flavor = utils.get_flavor(client, base_image.min_ram) print "Found flavor", flavor remote_snap_image = client.images.get(snap_image.external_id) print "Found image", remote_snap_image num_to_launch = calculate_deficit(provider, base_image) for i in range(num_to_launch): try: server, machine = launch_node(client, snap_image, remote_snap_image, flavor, last_name) last_name = machine.name except: traceback.print_exc() error = True while True: utils.update_stats(provider) building_machines = provider.building_machines if not building_machines: print "No more machines are building, finished." break print "Waiting on %s machines" % len(building_machines) for machine in building_machines: try: check_machine(jenkins, client, machine, error_counts, credentials_id) except: traceback.print_exc() print "Abandoning machine %s" % machine.id utils.log.exception("Abandoning ID: %s" % machine.id) machine.state = vmdatabase.ERROR error = True db.commit() time.sleep(3) if error: sys.exit(1)
def run(opt):
    """Train RetouchGenerator for opt.n_epochs, evaluating each epoch.

    Logs per-epoch average losses and sample images to two TensorBoard
    writers (train/test) under opt.checkpoint_dir, and checkpoints the
    generator via ModelSaver every opt.checkpoint_interval epochs.
    """
    train_loader, test_loader = create_loaders(opt)

    # Initialize generator and discriminator
    generator = load_or_init_models(RetouchGenerator(opt.device, opt.pw_guide), opt)

    # Optimizers
    optimizer_G = torch.optim.Adam(generator.parameters(), lr=opt.lr, weight_decay=1e-8)

    # Losses
    # criterion_GAN = torch.nn.MSELoss()
    # criterion_pixelwise = torch.nn.L1Loss()

    # if opt.cuda:
    #     generator = generator.cuda()
    #     discriminator = discriminator.cuda()

    # NOTE(review): despite the name, criterion_pixelwise is an MSELoss
    # here (the L1 variant above is commented out).
    generator, criterion_pixelwise = to_variables((generator,torch.nn.MSELoss()), cuda=opt.cuda, device=opt.device)

    saverG = ModelSaver(f'{opt.checkpoint_dir}/saved_models/{opt.name}')
    train_writer = SummaryWriter(log_dir=os.path.join(opt.checkpoint_dir, 'train'))
    test_writer = SummaryWriter(log_dir=os.path.join(opt.checkpoint_dir, 'test'))

    # Training resumes from opt.epoch (non-zero when a checkpoint was loaded).
    for epoch in tqdm(range(opt.epoch, opt.n_epochs), desc='Training'):
        ####
        # Train
        ###
        avg_stats = defaultdict(float)
        for i, data in enumerate(train_loader):
            data = to_variables(data, cuda=opt.cuda, device=opt.device)
            y_hat, loss_G = trainG(generator, criterion_pixelwise, optimizer_G, data)
            update_stats(avg_stats, loss_G)

            # Print image to tensorboard
            # assumes data[0] is the original and data[2] the edited
            # target image — TODO confirm against create_loaders
            if (epoch % opt.sample_interval == 0) and (i % 50 == 0):
                train_writer.add_image('RetouchNet', y_hat[0], epoch)
                train_writer.add_image('Edited', data[2][0], epoch)
                train_writer.add_image('Original', data[0][0], epoch)

        # Log Progress: per-epoch averages of every accumulated stat.
        str_out = '[train] {}/{} '.format(epoch, opt.n_epochs)
        for k, v in avg_stats.items():
            avg = v / len(train_loader)
            train_writer.add_scalar(k, avg, epoch)
            str_out += '{}: {:.6f} '.format(k, avg)
        print(str_out)

        ####
        # Test
        ###
        avg_stats = defaultdict(float)
        images = None
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                data = to_variables(data, cuda=opt.cuda, device=opt.device, test=True)
                images, losses = test(generator, criterion_pixelwise, data)
                update_stats(avg_stats, losses)

                # Print image to tensorboard
                if (epoch % opt.sample_interval == 0) and (i % 5 == 0):
                    test_writer.add_image('RetouchNet', images[0], epoch)
                    test_writer.add_image('Edited', data[2][0], epoch)
                    test_writer.add_image('Original', data[0][0], epoch)

        # Log Progress
        str_out = '[test] {}/{} '.format(epoch, opt.n_epochs)
        for k, v in avg_stats.items():
            avg = v / len(test_loader)
            test_writer.add_scalar(k, avg, epoch)
            str_out += '{}: {:.6f} '.format(k, avg)
        print(str_out)

        # If at sample interval save image
        # if epoch % opt.sample_interval == 0:
        #     x_hr, x_lr, y_hr, y_lr = data
        #     test_writer.add_image('RetouchNet', images[0], epoch)
        #     test_writer.add_image('GroundTruth', y_hr[0], epoch)
        #     test_writer.add_image('raw', x_hr[0], epoch)

        if epoch % opt.checkpoint_interval == 0:
            # Save model checkpoints
            # NOTE(review): loss_G here is from the LAST training batch,
            # not the epoch average, and is unbound if train_loader is
            # empty — confirm this is intended.
            saverG.save_if_best(generator, loss_G['loss_G'])
def forward(self, images, captions, lengths, img_lengths, img_txts,
            img_spans, txt_spans, labels, ids=None, epoch=None, *args):
    """Run one joint training step of the image and text span parsers.

    Computes reconstruction (NLL) + KL losses for both modalities plus a
    contrastive matching loss, backpropagates, steps the optimizer,
    accumulates span-F1 statistics against the gold spans, and returns a
    formatted progress-log string (including example parse trees).
    """
    self.niter += 1
    self.logger.update('Eit', self.niter)
    self.logger.update('lr', self.optimizer.param_groups[0]['lr'])
    # Loaders may hand lengths over as plain lists; normalize to tensors.
    img_lengths = torch.tensor(img_lengths).long() if isinstance(
        img_lengths, list) else img_lengths
    lengths = torch.tensor(lengths).long() if isinstance(lengths,
                                                         list) else lengths
    if torch.cuda.is_available():
        images = images.cuda()
        captions = captions.cuda()
        lengths = lengths.cuda()
        img_lengths = img_lengths.cuda()
    bsize = captions.size(0)
    # Parse both modalities; each returns embeddings, NLL/KL terms,
    # span marginals, argmax span sets, trees and log-probs.
    img_emb, nll_img, kl_img, span_margs_img, argmax_spans_img, trees_img, lprobs_img = self.forward_img_parser(
        images, img_lengths)
    ll_loss_img = nll_img.sum()
    kl_loss_img = kl_img.sum()
    txt_emb, nll_txt, kl_txt, span_margs_txt, argmax_spans_txt, trees_txt, lprobs_txt = self.forward_txt_parser(
        captions, lengths)
    ll_loss_txt = nll_txt.sum()
    kl_loss_txt = kl_txt.sum()
    # Cross-modal contrastive loss over embeddings and predicted spans.
    contrastive_loss = self.forward_loss(img_emb, txt_emb, img_lengths,
                                         lengths, argmax_spans_img,
                                         argmax_spans_txt, span_margs_img,
                                         span_margs_txt)
    mt_loss = contrastive_loss.sum()
    # Per-sample weighted total: LM terms scaled by vse_lm_alpha, the
    # matching term by vse_mt_alpha.
    loss_img = self.vse_lm_alpha * (ll_loss_img + kl_loss_img) / bsize
    loss_txt = self.vse_lm_alpha * (ll_loss_txt + kl_loss_txt) / bsize
    loss_mt = self.vse_mt_alpha * mt_loss / bsize
    loss = loss_img + loss_txt + loss_mt
    self.optimizer.zero_grad()
    loss.backward()
    if self.grad_clip > 0:
        clip_grad_norm_(self.all_params, self.grad_clip)
    self.optimizer.step()
    self.logger.update('Loss_img', loss_img.item(), bsize)
    self.logger.update('Loss_txt', loss_txt.item(), bsize)
    self.logger.update('KL-Loss_img', kl_loss_img.item() / bsize, bsize)
    self.logger.update('KL-Loss_txt', kl_loss_txt.item() / bsize, bsize)
    self.logger.update('LL-Loss_img', ll_loss_img.item() / bsize, bsize)
    self.logger.update('LL-Loss_txt', ll_loss_txt.item() / bsize, bsize)
    # Word counts include one extra position per sample (+1) —
    # presumably an end-of-sequence token; verify against the parsers.
    self.n_word_img += (img_lengths + 1).sum().item()
    self.n_word_txt += (lengths + 1).sum().item()
    self.n_sent += bsize
    # Accumulate span-F1 stats per sample; the trivial whole-sentence
    # span is dropped via [:-1] before comparison.
    for b in range(bsize):
        max_img_len = img_lengths[b].item()
        pred_img = [(a[0], a[1]) for a in argmax_spans_img[b]
                    if a[0] != a[1]]
        pred_set_img = set(pred_img[:-1])
        gold_img = [(img_spans[b][i][0].item(), img_spans[b][i][1].item())
                    for i in range(max_img_len - 1)]
        gold_set_img = set(gold_img[:-1])
        utils.update_stats(pred_set_img, [gold_set_img],
                           self.all_stats_img)
        max_txt_len = lengths[b].item()
        pred_txt = [(a[0], a[1]) for a in argmax_spans_txt[b]
                    if a[0] != a[1]]
        pred_set_txt = set(pred_txt[:-1])
        gold_txt = [(txt_spans[b][i][0].item(), txt_spans[b][i][1].item())
                    for i in range(max_txt_len - 1)]
        gold_set_txt = set(gold_txt[:-1])
        utils.update_stats(pred_set_txt, [gold_set_txt],
                           self.all_stats_txt)

    # if self.niter % self.log_step == 0:
    p_norm, g_norm = self.norms()
    all_f1_img = utils.get_f1(self.all_stats_img)
    all_f1_txt = utils.get_f1(self.all_stats_txt)
    train_kl_img = self.logger.meters["KL-Loss_img"].sum
    train_ll_img = self.logger.meters["LL-Loss_img"].sum
    train_kl_txt = self.logger.meters["KL-Loss_txt"].sum
    train_ll_txt = self.logger.meters["LL-Loss_txt"].sum
    # Build the progress summary: parameter/gradient norms, per-word
    # reconstruction PPL, KL per sentence, PPL bound, corpus F1 and
    # throughput for both modalities.
    info = '|Pnorm|: {:.6f}, |Gnorm|: {:.2f}, ReconPPL-Img: {:.2f}, KL-Img: {:.2f}, ' + \
        'PPLBound-Img: {:.2f}, CorpusF1-Img: {:.2f}, ' + \
        'ReconPPL-Txt: {:.2f}, KL-Txt: {:.2f}, ' + \
        'PPLBound-Txt: {:.2f}, CorpusF1-Txt: {:.2f}, ' + \
        'Speed: {:.2f} sents/sec'
    info = info.format(
        p_norm, g_norm,
        np.exp(train_ll_img / self.n_word_img),
        train_kl_img / self.n_sent,
        np.exp((train_ll_img + train_kl_img) / self.n_word_img),
        all_f1_img[0],
        np.exp(train_ll_txt / self.n_word_txt),
        train_kl_txt / self.n_sent,
        np.exp((train_ll_txt + train_kl_txt) / self.n_word_txt),
        all_f1_txt[0],
        self.n_sent / (time.time() - self.s_time))
    # Append example predicted vs. gold trees for the first image sample.
    pred_action_img = utils.get_actions(trees_img[0])
    sent_s_img = img_txts[0]
    pred_t_img = utils.get_tree(pred_action_img, sent_s_img)
    gold_t_img = utils.span_to_tree(img_spans[0].tolist(),
                                    img_lengths[0].item())
    gold_action_img = utils.get_actions(gold_t_img)
    gold_t_img = utils.get_tree(gold_action_img, sent_s_img)
    info += "\nPred T Image: {}\nGold T Image: {}".format(
        pred_t_img, gold_t_img)
    # ...and for the first text sample, decoded through the vocab.
    pred_action_txt = utils.get_actions(trees_txt[0])
    sent_s_txt = [
        self.vocab.idx2word[wid] for wid in captions[0].cpu().tolist()
    ]
    pred_t_txt = utils.get_tree(pred_action_txt, sent_s_txt)
    gold_t_txt = utils.span_to_tree(txt_spans[0].tolist(),
                                    lengths[0].item())
    gold_action_txt = utils.get_actions(gold_t_txt)
    gold_t_txt = utils.get_tree(gold_action_txt, sent_s_txt)
    info += "\nPred T Text: {}\nGold T Text: {}".format(
        pred_t_txt, gold_t_txt)
    return info