def __init__(self, opt, padding_idx4item=0, padding_idx4prefer=0):
    """Build the embedding tables, normalisation layers and the three encoders.

    Args:
        opt: configuration mapping. The keys read here are ``batch_size``,
            ``max_length``, ``dropout``, ``vocab_size``, ``user_size``,
            ``dim`` and ``embedding_size``. NOTE: ``opt['num_layers']`` is
            overwritten with 2 before the encoders are created, so the
            caller's mapping is modified (kept from the original code).
        padding_idx4item: padding index used for the item embedding table
            (the user embedding table is created with the same index).
        padding_idx4prefer: padding index stored as ``pad_idx4prefer``.
    """
    super().__init__()
    # self.pad_idx, self.start_idx, self.end_idx)
    self.batch_size = opt['batch_size']
    self.max_length = opt['max_length']
    # The layer count is fixed; the value in ``opt`` is ignored and replaced.
    self.num_layers = 2  # opt['num_layers']
    self.vocab_size = opt['vocab_size']
    self.user_size = opt['user_size']
    self.dim = opt['dim']
    self.embedding_size = opt['embedding_size']
    self.pad_idx4item = padding_idx4item
    self.pad_idx4prefer = padding_idx4prefer

    self.embeddings = _create_embeddings(
        self.vocab_size, self.embedding_size, self.pad_idx4item)
    self.user_embeddings = _create_embeddings(
        self.user_size, self.embedding_size, self.pad_idx4item)
    self.position_embeddings = nn.Embedding(opt['max_length'], opt['dim'])
    self.LayerNorm = LayerNorm(opt['dim'], eps=1e-12)
    # ``self.dropout`` is the Dropout module; the raw float rate that used to
    # be stored under the same name first was a dead store and is gone.
    self.dropout = nn.Dropout(opt['dropout'])

    # Single source of truth for the layer count handed to the encoders.
    opt['num_layers'] = self.num_layers
    self.SAS_encoder = Encoder(opt)
    self.prefer_SAS_encoder = Encoder(opt)
    self.neg_SAS_encoder = Encoder(opt)
    self.item_norm = nn.Linear(opt['dim'], opt['dim'])
    self.criterion = nn.BCELoss()
    self.cs_loss = nn.CrossEntropyLoss()
def __init__(self, sentences, context=1, hidden=5, concat=False):
    """Initialise the CBOW weights and train on ``sentences`` immediately.

    Args:
        sentences: iterable of sentences. It is passed to ``Encoder`` and then
            iterated again for training, so it presumably must be re-iterable
            (a list rather than a one-shot generator) -- TODO confirm.
        context: context size handed to ``sentence2contexts``.
        hidden: number of hidden units.
        concat: when true the input layer has ``context`` slots of
            ``encoding_length`` each, otherwise a single slot.
    """
    logging.info(msg="starting CBOW training..")
    self.context = context
    self.encoder = Encoder(sentences=sentences)
    self.huffman_encoder = HuffmanEncoder(self.encoder.counter)
    self.encoding_length = self.encoder.encoding_length
    self.hidden_units = hidden
    self.output_units = 1
    self.input_units = context if concat else 1
    self.input2hidden = np.random.rand(
        self.hidden_units, self.input_units * self.encoding_length) * 0.1
    self.hidden2output = np.random.rand(
        self.output_units * self.encoding_length - 1,
        self.hidden_units) * 0.1

    # train model
    word_count = 0
    last_time = time.time()
    for sentence in sentences:
        for w, c in sentence2contexts(sentence, self.context):
            self._train(w, c)
            word_count += 1
            if word_count % 100 == 0:
                elapsed = time.time() - last_time
                # A coarse clock can report zero elapsed time for 100 fast
                # steps; avoid a ZeroDivisionError in that case.
                if elapsed > 0:
                    words_per_sec = 1.0 / elapsed * 100
                else:
                    words_per_sec = float("inf")
                logging.info(msg="trained on %s words. %s words/sec" %
                             (word_count, words_per_sec))
                last_time = time.time()
def create_encoders(data: List[Tuple[Name, Lang]]) \
        -> Tuple[Encoder[Char], Encoder[Lang]]:
    """Build one encoder over input characters and one over output languages.

    The character encoder is fed every character of every name in ``data``;
    the language encoder is fed the language of every pair.
    """
    all_chars = (ch for person_name, _ in data for ch in person_name)
    all_langs = (language for _, language in data)
    return Encoder(all_chars), Encoder(all_langs)
def make_model(cnn3d, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8,
               dropout=0.1):
    """Helper: construct an ``EncoderDecoder`` model from hyperparameters.

    Every sub-layer receives its own deep copy of the attention,
    feed-forward and positional-encoding prototypes.  After construction,
    every trainable parameter with more than one dimension whose name does
    not start with ``"cnn3d"`` is re-initialised with Xavier-uniform.
    """
    clone = copy.deepcopy
    attention = MultiHeadedAttention(h, d_model)
    feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    positional = PositionalEncoding(d_model, dropout)

    encoder_stack = Encoder(
        EncoderLayer(d_model, clone(attention), clone(feed_forward), dropout),
        N)
    decoder_stack = Decoder(
        DecoderLayer(d_model, clone(attention), clone(attention),
                     clone(feed_forward), dropout),
        N)
    position_only = nn.Sequential(clone(positional))
    embed_then_position = nn.Sequential(
        Embeddings(d_model, tgt_vocab), clone(positional))
    generator = Generator(d_model, tgt_vocab)

    model = EncoderDecoder(encoder_stack, decoder_stack, position_only,
                           embed_then_position, generator, cnn3d)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for param_name, param in model.named_parameters():
        if param_name.startswith("cnn3d"):
            continue  # leave the cnn3d weights as they were handed in
        if param.requires_grad and param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model
def __init__(self, backbone=None, num_classes=21):
    """Assemble the SSD300 heads on top of ``backbone``.

    Args:
        backbone: feature extractor; it must expose ``out_channels``.
        num_classes: number of classes predicted per default box.
    """
    super(SSD300, self).__init__()
    self.feature_extractor = backbone
    self.num_classes = num_classes
    # number of default bounding boxes in each feature map
    self.num_defaults = [4, 6, 6, 6, 4, 4]

    # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
    self._build_additional_features(self.feature_extractor.out_channels)

    # One (location, confidence) conv pair per feature map.  The pair is
    # created together so layers are instantiated in the same order as a
    # plain loop appending to two lists would.
    head_pairs = [
        (nn.Conv2d(channels, boxes * 4, kernel_size=3, padding=1),
         nn.Conv2d(channels, boxes * self.num_classes, kernel_size=3,
                   padding=1))
        for boxes, channels in zip(self.num_defaults,
                                   self.feature_extractor.out_channels)
    ]

    # location regression layers and classification layers
    self.loc = nn.ModuleList([loc_head for loc_head, _ in head_pairs])
    self.conf = nn.ModuleList([conf_head for _, conf_head in head_pairs])
    self._init_weights()

    # all default bounding boxes in SSD (original note: shape [8732, 4])
    default_box = dboxes300()
    self.compute_loss = Loss(default_box)
    self.encoder = Encoder(default_box)
    self.postprocess = PostProcess(default_box)
class InputTransformer:
    """Turn raw training samples into padded index sequences and one-hot labels."""

    def __init__(self):
        self.encoder = Encoder()
        # LabelBinarizer fitted by the most recent transform() call; kept so
        # that one-hot columns can be mapped back to the original labels.
        self.label_transform = None

    def transform(self, X_train, y_train, augment):
        """Encode, pad and binarize a training set.

        Args:
            X_train: iterable of samples understood by ``preprocess_chars``.
            y_train: iterable of labels, one per sample.
            augment: optional callable ``(X, y) -> (X, y)`` applied first;
                pass ``None`` to skip augmentation.

        Returns:
            ``(X, y)`` where ``X`` is the output of
            ``sequence.pad_sequences(..., maxlen=600)`` and ``y`` the output
            of ``LabelBinarizer.fit_transform``.
        """
        X_train = list(X_train)
        y_train = list(y_train)
        print('before augmenting', len(X_train))
        if augment is not None:
            X_train, y_train = augment(X_train, y_train)
        print('after augmetning', len(X_train), len(y_train))

        def char_func(char):
            # word = WordNetLemmatizer().lemmatize(word)
            # Shift by one, presumably so that 0 stays free for padding
            # -- TODO confirm against the model's embedding layer.
            return self.encoder.transform(char) + 1

        X_train = [
            preprocess_chars(ingredients, char_func)
            for ingredients in X_train
        ]
        lengths = numpy.array([len(x) for x in X_train])
        print(lengths.min(), lengths.mean(), lengths.max(), lengths.std())
        X_train = sequence.pad_sequences(X_train, maxlen=600)
        print("ingredients")
        print(X_train[:3])

        # Keep the fitted binarizer instead of discarding it.
        self.label_transform = LabelBinarizer()
        y_train = self.label_transform.fit_transform(y_train)
        return X_train, y_train
def eval_ssd300_mlperf_coco(args):
    """Load an SSD300 checkpoint and evaluate it on COCO val2017.

    Reads ``args.no_cuda``, ``args.data``, ``args.checkpoint``,
    ``args.device`` and ``args.threshold``.

    Returns:
        Whatever ``coco_eval`` returns (previously the result was dropped and
        the function returned ``None``).
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    val_trans = SSDTransformer(dboxes, (300, 300), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ssd300 = SSD300(val_coco.labelnum)
    print("loading model checkpoint", args.checkpoint)
    # Keep every tensor on the CPU while loading, whatever device saved it.
    od = torch.load(args.checkpoint,
                    map_location=lambda storage, loc: storage)
    ssd300.load_state_dict(od["model"])

    if use_cuda:
        ssd300.cuda(args.device)

    # No loss object is built here: evaluation never used it.
    return coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                     args.threshold, args.device)
class SceneDataset(InMemoryDataset):
    """In-memory graph dataset built from pickled scene graphs.

    File names come from the module-level ``cmd_args`` object.
    """

    def __init__(self, root, config, transform=None, pre_transform=None):
        # The encoder is created before the parent constructor runs;
        # presumably the parent may call process(), which needs it
        # -- TODO confirm.
        self.config = config
        self.attr_encoder = Encoder(config)
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Name(s) of the raw pickle file(s)."""
        # return ["graphs.pkl"]
        return [cmd_args.graph_file_name]

    @property
    def processed_file_names(self):
        """Name(s) of the processed dataset file(s)."""
        # return ['train_1000_dataset.pt']
        return [cmd_args.dataset_name]

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Convert every pickled graph into a ``Data`` object and save the collation."""
        samples = []
        for raw_path in self.raw_paths:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only use with trusted graph files.
            with open(raw_path, 'rb') as raw_file:
                graphs = pickle.load(raw_file)

            for graph_id, graph in enumerate(graphs):
                node_names = [node.name for node in graph.nodes]
                x = self.attr_encoder.get_embedding(node_names)

                edge_index, edge_types = graph.get_edge_info()
                edge_names = [f"edge_{tp}" for tp in edge_types]
                edge_attrs = torch.tensor(
                    self.attr_encoder.get_embedding(edge_names))

                sample = Data(torch.tensor(x), torch.tensor(edge_index),
                              edge_attrs, graph.target_id)
                sample.obj_num = len(graph.scene["objects"])
                sample.graph_id = graph_id
                # sample.attr_encoder = self.attr_encoder
                samples.append(sample)

        data, slices = self.collate(samples)
        torch.save((data, slices), self.processed_paths[0])
def __init__(self):
    """Create the empty chunk stores and the helper components."""
    # Lists that save all chunks / extracted features, and their negative
    # counterparts.
    self.ChunkExp, self.ChunkNegExp = [], []
    # The SVM judgement model used to chunk a given sentence.
    self.chunker = Chunker.Chunker()
    # The NB classification model applied on a chunk-level sentence.
    self.SRLabler = SRLabeler.SRLabeler()
    self.encoder = En.Encoder()
def eval_ssd_r34_mlperf_coco(args):
    """Evaluate an SSD-ResNet34 model, on COCO val2017 or in dummy mode.

    With ``args.dummy`` set no dataset is opened: the ground truth, encoder,
    label map and data loader passed to ``coco_eval`` are all ``None`` and
    the label count is fixed to 81.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes_R34_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    val_trans = SSDTransformer(
        dboxes, (args.image_size[0], args.image_size[1]), val=True)

    if args.dummy:
        cocoGt = None
        encoder = None
        inv_map = None
        val_dataloader = None
        labelnum = 81
    else:
        val_annotate = os.path.join(
            args.data, "annotations/instances_val2017.json")
        val_coco_root = os.path.join(args.data, "val2017")
        cocoGt = COCO(annotation_file=val_annotate)
        val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
        inv_map = {v: k for k, v in val_coco.label_map.items()}
        # Accuracy runs keep the final partial batch; all other runs drop it.
        val_dataloader = DataLoader(val_coco,
                                    batch_size=args.batch_size,
                                    shuffle=False,
                                    sampler=None,
                                    num_workers=args.workers,
                                    drop_last=not args.accuracy_mode)
        labelnum = val_coco.labelnum

    ssd_r34 = SSD_R34(labelnum, strides=args.strides)
    if args.checkpoint:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint,
                        map_location=lambda storage, loc: storage)
        ssd_r34.load_state_dict(od["model"])

    if use_cuda:
        ssd_r34.cuda(args.device)

    coco_eval(ssd_r34, val_dataloader, cocoGt, encoder, inv_map, args)
def create_encoders(
        data: List[Tuple[Inp, Out]]) -> Tuple[Encoder[Char], Encoder[POS]]:
    """Create a pair of encoders, for characters and POS tags respectively.

    Parameters
    ----------
    data : List[Tuple[Inp, Out]]
        Input/output pairs from which the encoders are built.  Pass the
        training pairs only, never development or evaluation pairs.

    Returns
    -------
    (char_enc, pos_enc) : Tuple[Encoder[Char], Encoder[POS]]
        Encoder over all input characters and encoder over all POS tags.
    """
    # Every character of every word of every input.
    characters = (
        char
        for inp, _ in data
        for word in inp
        for char in word
    )
    # Every POS tag of every output.
    tags = (pos for _, out in data for pos in out)

    char_enc = Encoder(characters)
    pos_enc = Encoder(tags)
    return (char_enc, pos_enc)
def main():
    """Run SSD300 on the given images and draw the detections."""
    # Parse arguments
    args = parse_args()

    # Get categories names
    with open(args.annotations, 'r') as anno:
        coco_names = json.load(anno)['categories']

    # Prepare map of COCO labels to COCO names
    name_map = {category['id']: category['name'] for category in coco_names}

    # Prepare map of SSD to COCO labels: the ids listed in ``deleted`` are
    # skipped, so each of the 80 labels is shifted by the number of skipped
    # ids seen so far.
    deleted = [12, 26, 29, 30, 45, 66, 68, 69, 71, 83]
    inv_map = {}
    shift = 0
    for ssd_label in range(1, 81):
        while ssd_label + shift in deleted:
            shift += 1
        inv_map[ssd_label] = ssd_label + shift

    # Prepare colors for categories (three random channels per id)
    category_id_to_color = {
        cat_id: [random.uniform(0, 1) for _ in range(3)]
        for cat_id in range(1, 91)
    }

    # Set math plot lib size
    plt.rcParams["figure.figsize"] = (12, 8)

    # Build and load SSD model
    ssd300 = SSD300(81, backbone="resnet34", model_path=None, dilation=None)
    load_checkpoint(ssd300, args.model)
    ssd300.eval()

    # Prepare encoder
    encoder = Encoder(dboxes300_coco())

    # Print images
    for image in args.images:
        print_image(image, ssd300, encoder, inv_map, name_map,
                    category_id_to_color, args.threshold)
class AssignGTtoDefaultBox(object):
    """Transform that maps ground-truth boxes/labels onto the SSD default boxes."""

    def __init__(self):
        self.default_box = dboxes300()
        self.encoder = Encoder(self.default_box)

    def __call__(self, image, target):
        """Replace ``target['boxes']`` and ``target['labels']`` by their encoded form.

        Original notes on shapes (not verifiable from this block): boxes are
        [batch, n_objects, 4] and labels [batch, n_objects] on input;
        [batch, 8732, 4] and [batch, 8732] on output.

        Returns:
            ``(image, target)`` with ``target`` updated in place.
        """
        # assign ground truth to default bounding boxes
        encoded_boxes, encoded_labels = self.encoder.encode(
            target['boxes'], target['labels'])
        target['boxes'] = encoded_boxes
        target['labels'] = encoded_labels
        return image, target
def eval_ssd_r34_mlperf_coco(args):
    """Load an SSD-ResNet34 checkpoint and evaluate it on COCO val2017.

    ``args.onnx`` selects the evaluation routine: ``'export'`` uses
    ``coco_eval_export``, ``'eval'`` uses ``coco_eval_onnx`` and anything
    else (including a falsy value) uses ``coco_eval``.

    Returns:
        The result of the selected evaluation routine.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes_R34_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    val_trans = SSDTransformer(
        dboxes, (args.image_size[0], args.image_size[1]), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    print('ssd r34')
    ssd_r34 = SSD_R34(val_coco.labelnum, strides=args.strides)

    print("loading model checkpoint", args.checkpoint)
    od = torch.load(args.checkpoint,
                    map_location=lambda storage, loc: storage)
    ssd_r34.load_state_dict(od["model"])

    if use_cuda:
        ssd_r34.cuda(args.device)

    # The loss object that used to be built (and moved to the GPU) here was
    # never used during evaluation, so it is no longer created.
    eval_args = (ssd_r34, val_coco, cocoGt, encoder, inv_map,
                 args.threshold, args.device, use_cuda)
    if args.onnx == 'export':
        return coco_eval_export(*eval_args)
    if args.onnx == 'eval':
        return coco_eval_onnx(*eval_args)
    return coco_eval(*eval_args)
def __init__(self):
    """Create the instance with a default-constructed ``Encoder``."""
    self.encoder = Encoder()
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with SGD, evaluating at selected iterations.

    Reads from ``args``: ``world_size``, ``no_cuda``, ``data``,
    ``batch_size``, ``checkpoint``, ``dist_backend``, ``dist_url``,
    ``iteration``, ``epochs``, ``evaluation``, ``no_save`` and ``threshold``;
    sets ``args.distributed``.

    The learning rate starts at 1e-3 and drops to 1e-4 at iteration 160000
    and to 1e-5 at iteration 200000.  At every iteration listed in
    ``args.evaluation`` a checkpoint is written to ``./models`` (unless
    ``args.no_save``) and ``coco_eval`` is run; training stops as soon as it
    returns a truthy value.

    Returns:
        None.
    """
    args.distributed = args.world_size > 1

    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    if args.distributed:
        # The process group must exist before DistributedSampler and
        # DistributedDataParallel are constructed, so initialise it first.
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, (300, 300), val=False)
    val_trans = SSDTransformer(dboxes, (300, 300), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None

    # DataLoader rejects shuffle=True together with a sampler; the sampler
    # does the shuffling in the distributed case.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  num_workers=4,
                                  sampler=train_sampler)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    if args.distributed:
        ssd300 = DistributedDataParallel(ssd300)
    else:
        ssd300 = torch.nn.DataParallel(ssd300)

    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):
        if train_sampler is not None:
            # Without this every epoch would reuse the same shuffling order.
            train_sampler.set_epoch(epoch)

        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):
            start = time.time()

            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)

            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are kept out of the running average only; the
            # optimisation step below still runs.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            optim.zero_grad()
            loss.backward()
            optim.step()
            end = time.time()

            if nbatch % 10 == 0:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average time: {:.3f} secs"
                      .format(iter_num, loss.item(), avg_loss, end - start))

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold):
                    return

            iter_num += 1
def val300(path):
    """Validate an SSD300 model (21 classes) via ``valmodel`` for ``path``."""
    model = SSD300(21)
    default_boxes = dboxes300()
    box_encoder = Encoder(default_boxes)
    transform = SSDTransformer(default_boxes, (300, 300), val=True)
    valmodel(model, path, default_boxes, transform, box_encoder)
def val300_coco(model_path):
    """Run an SSD300 checkpoint over COCO val2017 and print the bbox metrics.

    The dataset locations are hard-coded relative paths.  The model is moved
    to the GPU unconditionally, so CUDA is required.

    Args:
        model_path: path of a checkpoint whose ``"model"`` entry holds the
            state dict.
    """
    print("loading model at {}".format(model_path))
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    trans = SSDTransformer(dboxes, (300, 300), val=True)

    #annotate = "../../coco_ssd/instances_minival2014.json"
    #coco_root = "../../coco_data/val2014"
    #annotate = "../../coco_ssd/image_info_test-dev2015.json"
    #coco_root = "../../coco_data/test2015"
    annotate = "../../coco_ssd/instances_val2017.json"
    coco_root = "../../coco_data/val2017"

    cocoGt = COCO(annotation_file=annotate)
    coco = COCODetection(coco_root, annotate, trans)

    model = SSD300(coco.labelnum)
    od = torch.load(model_path)
    model.load_state_dict(od["model"])
    model.eval()
    model.cuda()

    ret = []
    inv_map = {v: k for k, v in coco.label_map.items()}

    start = time.time()
    for idx, image_id in enumerate(coco.img_keys):
        img, (htot, wtot), _, _ = coco[idx]

        with torch.no_grad():
            print("Parsing image: {}/{}".format(idx + 1, len(coco)),
                  end="\r")
            ploc, plabel = model(img.unsqueeze(0).cuda())

            try:
                result = encoder.decode_batch(ploc, plabel, 0.50, 200)[0]
            except Exception:
                # Best effort: a decoding failure is treated as "nothing
                # detected" and the image is skipped.  Only ``Exception`` is
                # caught so Ctrl-C / SystemExit still stop the run.
                print("")
                print("No object detected in idx: {}".format(idx), end="\r")
                continue

            loc, label, prob = [r.cpu().numpy() for r in result]
            for loc_, label_, prob_ in zip(loc, label, prob):
                # Scale each box by the image width/height and store it as
                # [x, y, width, height] followed by score and COCO label.
                ret.append([image_id, loc_[0] * wtot,
                            loc_[1] * htot,
                            (loc_[2] - loc_[0]) * wtot,
                            (loc_[3] - loc_[1]) * htot,
                            prob_,
                            inv_map[label_]])
    print("")
    print("Predicting Ended, totoal time: {:.2f} s".format(time.time() -
                                                          start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    #E.params.useSegm = 0
    #E.params.recThrs = [0.5]
    #E.params.maxDets = [10, 100, 200]
    E.evaluate()
    E.accumulate()
    E.summarize()
def val512(path):
    """Validate an SSD512 model (21 classes) via ``valmodel`` for ``path``."""
    model = SSD512(21)
    default_boxes = dboxes512()
    box_encoder = Encoder(default_boxes)
    transform = SSDTransformer(default_boxes, (512, 512), val=True)
    valmodel(model, path, default_boxes, transform, box_encoder)
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with MLPerf-style logging.

    Handles optional distributed training, seeding, LR warm-up and epoch-based
    decay, optional profiling, timing for ``args.performance_only`` runs,
    checkpointing and periodic COCO evaluation.

    Relies on module-level names not defined in this function, among them
    ``mllogger``, ``mllog_const``, ``ssd_print``, ``use_ipex``, ``ipex``,
    ``broadcast_seeds``, ``lr_warmup``, ``data_preprocess``,
    ``trace_handler``, ``AverageMeter``, ``ProgressMeter`` and ``coco_eval``.

    Returns:
        True as soon as an evaluation reports success (``coco_eval`` returned
        a truthy value), otherwise False once training ends.
    """
    # ``import torch.utils.data.distributed`` further down would otherwise
    # make ``torch`` a local name of this function.
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        try:
            from apex.parallel import DistributedDataParallel as DDP
            if 'WORLD_SIZE' in os.environ:
                args.distributed = int(os.environ['WORLD_SIZE']) > 1
        # NOTE(review): bare except -- any failure in the block above
        # (including a non-integer WORLD_SIZE) is reported as a missing APEX
        # installation.
        except:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex")

    local_seed = args.seed
    os.environ['USE_CUDA'] = str(use_cuda)
    if args.world_size > 1:
        args.distributed = True

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        print('Distributed training with DDP')
        if args.no_cuda:
            device = torch.device('cpu')
            # PMI_RANK / PMI_SIZE take precedence over the command-line
            # values when they are present in the environment.
            os.environ['RANK'] = str(os.environ.get('PMI_RANK', args.rank))
            os.environ['WORLD_SIZE'] = str(
                os.environ.get('PMI_SIZE', args.world_size))
            os.environ['MASTER_ADDR'] = args.master_addr
            os.environ['MASTER_PORT'] = args.port

            # Initialize the process group with ccl backend
            if args.backend == 'ccl':
                import torch_ccl
            dist.init_process_group(backend=args.backend)
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
        # set seeds properly
        args.seed = broadcast_seeds(args.seed, device)
        # Each rank gets its own seed derived from the shared one.
        local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    # Refer to https://pytorch.org/docs/stable/notes/randomness.html#dataloader
    torch.manual_seed(local_seed)  # Set PyTorch seed
    np.random.seed(seed=local_seed)  # Set Numpy seed
    random.seed(local_seed)  # Set the Python seed

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size),
        val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # Shuffle in the loader only when no sampler is used.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=0)
    # set shuffle=True in DataLoader
    # Leslie: here is the workaround: dist.broadcast will fail on other rank.
    # We will run evaluation on all the ranks.
    val_dataloader = DataLoader(val_coco,
                                batch_size=args.val_batch_size
                                or args.batch_size,
                                shuffle=False,
                                sampler=None,
                                num_workers=0)

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)

    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # The base learning rate is scaled linearly with global batch size / 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    # Model to NHWC
    ssd300 = ssd300.to(memory_format=torch.channels_last)

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # Flag tensor: zero until an evaluation succeeds.
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        # Number of warm-up iterations.
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        # No warm-up requested: the per-iteration hook does nothing.
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    if args.performance_only:
        train_time = AverageMeter('TrainTime', ':6.3f')
        progress = ProgressMeter(args.train_iteration, [train_time],
                                 prefix='Train: ')

    # Restore the model and optim from checkpoint
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
        optim.load_state_dict(od['optim'])

    # Model Prepack
    if use_ipex:
        if args.autocast:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.bfloat16,
                                          optimizer=optim)
        else:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.float32,
                                          optimizer=optim)

    # parallelize
    if args.distributed:
        device_ids = None
        ssd300 = torch.nn.parallel.DistributedDataParallel(
            ssd300, device_ids=device_ids)

    optim.zero_grad(set_to_none=True)
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            # Always true for now; the fragment-size check is disabled.
            naive_train_case = True  # img.shape[0] == fragment_size
            if naive_train_case:
                # Naive train case
                fimg, gloc, glabel, mask, pos_num, neg_num, num_mask = data_preprocess(
                    img, bbox, label, loss_func, args.autocast)

                # Timing starts only after the warm-up iterations.
                if args.performance_only and iter_num >= args.warmup_iterations:
                    start_time = time.time()
                if args.profile and args.performance_only and iter_num == 30:
                    # Profile Mode: only iteration 30 runs under the profiler
                    with torch.profiler.profile(
                            on_trace_ready=trace_handler) as prof:
                        with torch.cpu.amp.autocast(enabled=args.autocast):
                            ploc, plabel = ssd300(fimg)
                            loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                             pos_num, neg_num, num_mask,
                                             args.autocast)
                        loss.backward()
                        warmup_step(iter_num, current_lr)
                        optim.step()
                        optim.zero_grad(set_to_none=True)
                else:
                    # Non Profile Mode
                    with torch.cpu.amp.autocast(enabled=args.autocast):
                        ploc, plabel = ssd300(fimg)
                        loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                         pos_num, neg_num, num_mask,
                                         args.autocast)
                    loss.backward()
                    warmup_step(iter_num, current_lr)
                    optim.step()
                    optim.zero_grad(set_to_none=True)
            else:
                # Train case: when split input to several fragment size
                print("Not support input with several fragment size yet.")
                exit(-1)
                # A commented-out gradient-accumulation implementation used
                # to follow here: it split img/bbox/label into chunks of
                # ``fragment_size``, weighted each fragment loss by
                # fragment_size / batch_size and called backward per fragment.

            if args.performance_only and iter_num >= args.warmup_iterations:
                train_time.update(time.time() - start_time)
            if args.performance_only and iter_num % args.print_freq == 0:
                progress.display(iter_num)
            # Infinite losses are kept out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            if args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.8f}, Average Loss: {:.8f}"\
                    .format(iter_num, loss.item(), avg_loss))
            iter_num += 1
            if args.performance_only and iter_num >= args.train_iteration:
                break

        # Leave the epoch loop as well once the performance run is complete.
        if args.performance_only and iter_num >= args.train_iteration:
            break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics across ranks
                # before evaluating.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf.cpu().detach().numpy())
            # Leslie: here is the workaround: dist.broadcast will fail on
            # other rank, so evaluation runs on all the ranks
            # (``or True`` makes this condition always hold).
            if args.rank == 0 or True:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info,
                            "optim": optim.state_dict()
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh,
                             use_autocast=args.autocast):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Leslie: same workaround: since evaluation runs on all ranks,
            # the evaluation result does not need to be broadcast.
            # if args.distributed:
            #     dist.broadcast(success, 0)
            if success[0]:
                return True
            mllogger.end(key=mllog_const.EPOCH_STOP,
                         metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    if args.performance_only:
        batch_size = args.batch_size
        # Average per-sample latency in milliseconds and samples per second.
        latency = train_time.avg / batch_size * 1000
        perf = batch_size / train_time.avg
        print('train latency %.2f ms' % latency)
        print('train performance %.2f fps' % perf)
        print("Throughput: {:.3f} fps".format(perf))

    return False
def __init__(self, root, config, transform=None, pre_transform=None):
    """Create the attribute encoder, run the parent constructor and load the data.

    Args:
        root: dataset root passed to the parent constructor.
        config: configuration stored on the instance and handed to ``Encoder``.
        transform: passed through to the parent constructor.
        pre_transform: passed through to the parent constructor.
    """
    # Both attributes are set before the parent constructor runs.
    self.config = config
    self.attr_encoder = Encoder(config)
    super().__init__(root, transform, pre_transform)

    processed_file = self.processed_paths[0]
    loaded = torch.load(processed_file)
    self.data, self.slices = loaded
def train_mlperf_coco(args):
    """Train SSD-ResNet34 on COCO 2017 with SGD, evaluating at selected iterations.

    Reads from ``args``: ``no_cuda``, ``strides``, ``image_size``, ``data``,
    ``batch_size``, ``checkpoint``, ``device_ids``, ``iteration``,
    ``epochs``, ``evaluation``, ``no_save``, ``save_path`` and ``threshold``.

    The learning rate starts at 1e-3 and drops to 1e-4 at iteration 160000
    and to 1e-5 at iteration 200000.  Training stops as soon as
    ``coco_eval`` returns a truthy value.

    Returns:
        None.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    # Honour --no-cuda / missing GPUs instead of moving to 'cuda' blindly.
    device = 'cuda' if use_cuda else 'cpu'

    dboxes = dboxes_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, tuple(args.image_size), val=False)
    val_trans = SSDTransformer(dboxes, tuple(args.image_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    #print("Number of labels: {}".format(train_coco.labelnum))
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    # The model is built once, from the dataset's label count (a throwaway
    # SSD_R34(81, ...) instance used to be built first and then discarded).
    ssd_r34 = SSD_R34(train_coco.labelnum, strides=args.strides)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd_r34.load_state_dict(od["model"])
    ssd_r34.train()
    ssd_r34.to(device)
    if use_cuda:
        if args.device_ids and len(args.device_ids) > 1:
            ssd_r34 = nn.DataParallel(ssd_r34, args.device_ids)

    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.to('cuda')
        loss_func = nn.DataParallel(loss_func, args.device_ids)

    optim = torch.optim.SGD(ssd_r34.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    last_loss = [0.0] * 10  # window of the ten most recent losses
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):
        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):

            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            img = Variable(img, requires_grad=True)
            ploc, plabel, _ = ssd_r34(img.to(device))

            trans_bbox = bbox.transpose(1, 2).contiguous()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                Variable(label, requires_grad=False)
            # mean() reduces the loss to a scalar (the wrapped loss can
            # return more than one value).
            loss = loss_func(ploc, plabel, gloc, glabel).mean()

            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            last_loss.pop()
            last_loss = [loss.item()] + last_loss
            avg_last_loss = sum(last_loss) / len(last_loss)
            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average Last 10 Loss: {:.3f}"
                  .format(iter_num, loss.item(), avg_loss, avg_last_loss),
                  end="\r")

            optim.zero_grad()
            loss.backward()
            optim.step()
            loss = None

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # Unwrap by inspecting the object itself rather than
                    # len(args.device_ids), which fails for None and is wrong
                    # when the model was never wrapped.
                    module = (ssd_r34.module
                              if isinstance(ssd_r34, nn.DataParallel)
                              else ssd_r34)
                    torch.save(
                        {
                            "model": module.state_dict(),
                            "label_map": train_coco.label_info
                        }, args.save_path + "/iter_{}.pt".format(iter_num))

                if coco_eval(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, args.device_ids):
                    return

            iter_num += 1
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 and log the run through ``mlperf_log``.

    SGD starts at a learning rate of 1e-3 (momentum 0.9, weight decay 5e-4);
    the rate becomes 1e-4 at global iteration 160000 and 1e-5 at 200000.
    At each iteration listed in ``args.evaluation`` a checkpoint is saved
    under ``./models`` (unless ``args.no_save``) and ``coco_eval`` is run.

    Args:
        args: parsed command-line namespace. Fields read here: ``no_cuda``,
            ``data``, ``batch_size``, ``checkpoint``, ``iteration``,
            ``epochs``, ``evaluation``, ``no_save`` and ``threshold``.

    Returns:
        True as soon as ``coco_eval`` returns a truthy value, False once all
        ``args.epochs`` epochs have been consumed.
    """
    from coco import COCO

    # Run on the GPU only when it was not disabled and one is present.
    gpu_enabled = not args.no_cuda and torch.cuda.is_available()

    default_boxes = dboxes300_coco()
    encoder = Encoder(default_boxes)

    input_size = 300
    resolution = (input_size, input_size)
    train_trans = SSDTransformer(default_boxes, resolution, val=False)
    val_trans = SSDTransformer(default_boxes, resolution, val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    def in_data_dir(relative):
        # Resolve a dataset-relative path against the --data root.
        return os.path.join(args.data, relative)

    val_annotate = in_data_dir("annotations/instances_val2017.json")
    val_coco_root = in_data_dir("val2017")
    train_annotate = in_data_dir("annotations/instances_train2017.json")
    train_coco_root = in_data_dir("train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)
        ssd300.load_state_dict(checkpoint["model"])
    ssd300.train()
    if gpu_enabled:
        ssd300.cuda()
    loss_func = Loss(default_boxes)
    if gpu_enabled:
        loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    # Global iteration -> (new learning rate, message printed when applied).
    lr_milestones = {
        160000: (1e-4, "lr decay step #1"),
        200000: (1e-5, "lr decay step #2"),
    }

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {idx: name for name, idx in val_coco.label_map.items()}

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            milestone = lr_milestones.get(iter_num)
            if milestone is not None:
                current_lr, banner = milestone
                print("")
                print(banner)
                for group in optim.param_groups:
                    group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if gpu_enabled:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)

            trans_bbox = bbox.transpose(1, 2).contiguous()
            if gpu_enabled:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are shown but kept out of the running average.
            loss_value = loss.item()
            if not np.isinf(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            progress = "Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"
            print(progress.format(iter_num, loss_value, avg_loss), end="\r")

            optim.zero_grad()
            loss.backward()
            optim.step()

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    snapshot = {
                        "model": ssd300.state_dict(),
                        "label_map": train_coco.label_info,
                    }
                    torch.save(snapshot,
                               "./models/iter_{}.pt".format(iter_num))

                reached_target = coco_eval(ssd300, val_coco, cocoGt, encoder,
                                           inv_map, args.threshold, epoch,
                                           iter_num)
                if reached_target:
                    return True

            iter_num += 1
    return False
def create_encoders(data: List[Tuple[Inp, Out]]) \
        -> Tuple[Encoder[Word], Encoder[POS]]:
    """Build the word encoder and the POS-tag encoder for a dataset.

    Every element of every input sequence in `data` is fed to the first
    encoder, and every element of every output sequence to the second one.
    """

    def all_words():
        # Flatten the input side of each (input, output) pair.
        for sentence, _ in data:
            yield from sentence

    def all_tags():
        # Flatten the output side of each (input, output) pair.
        for _, tags in data:
            yield from tags

    return Encoder(all_words()), Encoder(all_tags())
# Pre-processing script fragment: read the raw scene file, build one Graph per
# (scene, target object) pair, pickle the graphs, create the SceneDataset and
# load a previously saved model when one exists on disk.

# NOTE(review): the suffix is concatenated to __file__ without a separator, so
# the first ".." fuses with the file name and abspath() cancels that fused
# component; the result is a "data" directory two levels above the directory
# that contains this file. Presumably intended -- confirm the repo layout.
data_dir = os.path.abspath(__file__ + "../../../../data")
raw_path = os.path.abspath(os.path.join(data_dir, "./processed_dataset/raw"))
scenes_path = os.path.abspath(os.path.join(raw_path, cmd_args.scene_file_name))
graphs_path = os.path.join(raw_path, cmd_args.graph_file_name)

# In the pytorch geometry package, only int and tensor seem to be allowed to
# be saved, so we process all the graphs and save them to a file.
with open(scenes_path, 'r') as scenes_file:
    scenes = json.load(scenes_file)

config = get_config()
graphs = []
# NOTE(review): attr_encoder is not used again in the visible code.
attr_encoder = Encoder(config)

# One graph per object in every scene, with that object as the target.
for scene in scenes:
    for target_id in range(len(scene["objects"])):
        graph = Graph(config, scene, target_id)
        graphs.append(graph)

with open(graphs_path, 'wb') as graphs_file:
    pickle.dump(graphs, graphs_file)

root = os.path.join(data_dir, "./processed_dataset")
scene_dataset = SceneDataset(root, config)

# Only load the model when the file exists and is non-empty.
if os.path.exists(cmd_args.model_path) and os.path.getsize(cmd_args.model_path) > 0:
    refrl = torch.load(cmd_args.model_path)
    logging.info("Loaded refrl model")
return True # not possible if not self.possible: return True return False if __name__ == "__main__": # load the data data_dir = os.path.abspath(__file__ + "../../../data") root = os.path.abspath(os.path.join(data_dir, "./processed_dataset")) config = get_config() attr_encoder = Encoder(config) scenes_path = os.path.abspath( os.path.join(data_dir, f"./processed_dataset/raw/{cmd_args.scene_file_name}")) with open(scenes_path, 'r') as scenes_file: scenes = json.load(scenes_file) # construct a mini example target_id = 0 graph = Graph(config, scenes[0], target_id) x = attr_encoder.get_embedding([node.name for node in graph.nodes]) edge_index, edge_types = graph.get_edge_info() edge_attrs = torch.tensor(attr_encoder.get_embedding(edge_types)) data_point = Data(x=x,
def train300_mlperf_coco(exp, args):
    """Run timed SSD300 training passes on COCO 2017 for a benchmark harness.

    For each of ``args.repeat`` passes the training batches are consumed
    inside a ``chrono.time('train')`` section, stopping once the batch index
    exceeds ``args.number``. Every batch loss is handed to
    ``exp.log_batch_loss``; ``exp.show_eta`` is called after each pass and
    ``exp.report`` once at the end.

    Args:
        exp: experiment object; the methods used here are ``get_device``,
            ``chrono``, ``log_batch_loss``, ``show_eta`` and ``report``.
        args: parsed command-line namespace. Fields read here: ``data``,
            ``batch_size``, ``checkpoint``, ``iteration``, ``repeat`` and
            ``number``.

    Returns:
        Always False.
    """
    from coco import COCO

    device = exp.get_device()
    chrono = exp.chrono()

    default_boxes = dboxes300_coco()
    encoder = Encoder(default_boxes)

    input_size = 300
    resolution = (input_size, input_size)
    train_trans = SSDTransformer(default_boxes, resolution, val=False)
    val_trans = SSDTransformer(default_boxes, resolution, val=True)

    def in_data_dir(relative):
        # Resolve a dataset-relative path against the --data root.
        return os.path.join(args.data, relative)

    val_annotate = in_data_dir("annotations/instances_val2017.json")
    val_coco_root = in_data_dir("val2017")
    train_annotate = in_data_dir("annotations/instances_train2017.json")
    train_coco_root = in_data_dir("train2017")

    # The validation objects are still constructed even though this
    # benchmark variant never evaluates.
    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)
        ssd300.load_state_dict(checkpoint["model"])
    ssd300.train()
    ssd300 = ssd300.to(device)
    loss_func = Loss(default_boxes).to(device)

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {idx: name for name, idx in val_coco.label_map.items()}

    for epoch in range(args.repeat):
        with chrono.time('train') as timer:
            for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
                if nbatch > args.number:
                    break

                img = Variable(img.to(device), requires_grad=True)
                ploc, plabel = ssd300(img)

                trans_bbox = bbox.transpose(1, 2).contiguous().to(device)
                label = label.to(device)
                gloc = Variable(trans_bbox, requires_grad=False)
                glabel = Variable(label, requires_grad=False)
                loss = loss_func(ploc, plabel, gloc, glabel)

                # Infinite losses are kept out of the running average.
                loss_value = loss.item()
                if not np.isinf(loss_value):
                    avg_loss = 0.999 * avg_loss + 0.001 * loss_value

                exp.log_batch_loss(loss)

                optim.zero_grad()
                loss.backward()
                optim.step()

                iter_num += 1

        exp.show_eta(epoch, timer)

    exp.report()
    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with optional fp16, DDP and DALI input.

    Phases, in order:
      1. optional distributed initialisation (driven by ``WORLD_SIZE``) and
         seeding;
      2. model / loss construction, optional half conversion and DDP wrap;
      3. optimizer creation (deferred to the first training iteration for
         fp16 + distributed without ``delay_allreduce``);
      4. creation of a separate eval model, optional JIT tracing and a dummy
         forward/backward warm-up pass;
      5. data pipeline construction (PyTorch ``DataLoader`` when
         ``args.no_dali``, DALI pipeline otherwise);
      6. the training loop with warm-up, two LR decay steps, periodic
         evaluation and MLPerf compliance logging.

    Args:
        args: parsed command-line namespace; see the attribute reads below.

    Returns:
        True when ``coco_eval`` reports success at an evaluation point,
        False when the epoch loop finishes without success, and None when
        training stops at the ``args.profile`` iteration.
    """
    from pycocotools.coco import COCO

    # Check that GPUs are actually available
    # NOTE(review): use_cuda is not read again in this function; the model
    # and loss are moved to CUDA unconditionally below.
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    local_seed = set_seeds(args)
    # start timing here
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    validate_group_bn(args.bn_group)
    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    # Build the model
    model_options = {
        'backbone': args.backbone,
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
    }

    ssd300 = SSD300(args.num_classes, **model_options)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    if args.opt_loss:
        loss_func = OptLoss(dboxes)
    else:
        loss_func = Loss(dboxes)
    loss_func.cuda()

    # N_gpu is recomputed here exactly as it was above.
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     gradient_predivide_factor=N_gpu / 8.0,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (N_gpu * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.MODEL_BN_SPAN,
                 value=args.bn_group * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=global_batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 2.5e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = args.wd
    # Fixed factor the loss is multiplied by before backward() in fp16 mode;
    # gradients are divided by it again before the optimizer step.
    static_loss_scale = 128.
    if args.use_fp16:
        if args.distributed and not args.delay_allreduce:
            # We can't create the flat master params yet, because we need to
            # imitate the flattened bucket structure that DDP produces.
            optimizer_created = False
        else:
            # Two buckets: half-precision parameters first, then fp32 ones.
            model_buckets = [
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.HalfTensor"
                ],
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.FloatTensor"
                ]
            ]
            flat_master_buckets = create_flat_master(model_buckets)
            optim = torch.optim.SGD(flat_master_buckets,
                                    lr=current_lr,
                                    momentum=current_momentum,
                                    weight_decay=current_weight_decay)
            optimizer_created = True
    else:
        optim = torch.optim.SGD(ssd300.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
        optimizer_created = True

    mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR,
                 value=current_lr)
    mlperf_print(key=mlperf_compliance.constants.OPT_WEIGHT_DECAY,
                 value=current_weight_decay)
    if args.warmup is not None:
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS,
                     value=args.warmup)
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_FACTOR,
                     value=args.warmup_factor)

    # Model is completely finished -- need to create separate copies, preserve parameters across
    # them, and jit
    ssd300_eval = SSD300(args.num_classes,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()
    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)

    # Get the existing state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())

    ssd300_eval.eval()

    print_message(args.local_rank, "epoch", "nbatch", "loss")
    # args.evaluation is rescaled by 32 / global_batch_size and truncated to
    # integer iteration numbers.
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    iter_num = args.iteration
    avg_loss = 0.0

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration
    num_elapsed_samples = 0

    # Generate normalization tensors
    # NOTE(review): mean and std are not read again in this function.
    mean, std = generate_mean_std(args)

    dummy_overflow_buf = torch.cuda.IntTensor([0])

    def step_maybe_fp16_maybe_distributed(optim):
        # In fp16 mode: fill the flat master gradients (from the DDP
        # allreduce buffers when distributed, otherwise by flattening the
        # model gradients), undo the loss scaling, step, then copy the
        # updated master weights back into the model parameters.
        if args.use_fp16:
            if args.distributed:
                for flat_master, allreduce_buffer in zip(
                        flat_master_buckets, ssd300.allreduce_buffers):
                    if allreduce_buffer is None:
                        raise RuntimeError("allreduce_buffer is None")
                    flat_master.grad = allreduce_buffer.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
            else:
                for flat_master, model_bucket in zip(flat_master_buckets,
                                                     model_buckets):
                    flat_grad = apex_C.flatten(
                        [m.grad.data for m in model_bucket])
                    flat_master.grad = flat_grad.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
        optim.step()
        if args.use_fp16:
            # Use multi-tensor scale instead of loop & individual parameter copies
            for model_bucket, flat_master in zip(model_buckets,
                                                 flat_master_buckets):
                multi_tensor_applier(
                    amp_C.multi_tensor_scale, dummy_overflow_buf, [
                        apex_C.unflatten(flat_master.data, model_bucket),
                        model_bucket
                    ], 1.0)

    # Random example batch used for JIT tracing and for the warm-up pass.
    input_c = 4 if args.pad_input else 3
    example_shape = [args.batch_size, 300, 300, input_c
                     ] if args.nhwc else [args.batch_size, input_c, 300, 300]
    example_input = torch.randn(*example_shape).cuda()

    if args.use_fp16:
        example_input = example_input.half()
    if args.jit:
        # DDP has some Python-side control flow.  If we JIT the entire DDP-wrapped module,
        # the resulting ScriptModule will elide this control flow, resulting in allreduce
        # hooks not being called.  If we're running distributed, we need to extract and JIT
        # the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks
        # to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input)
        # JIT the eval model too
        ssd300_eval = torch.jit.trace(ssd300_eval, example_input)

    # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here
    ploc, plabel = ssd300(example_input)

    # produce a single dummy "loss" to make things easier
    loss = ploc[0, 0, 0] + plabel[0, 0, 0]
    dloss = torch.randn_like(loss)
    # Cause cudnnFind for dgrad, wgrad to run
    loss.backward(dloss)

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True)
    ##### END INIT

    # This is the first place we touch anything related to data
    ##### START DATA TOUCHING
    mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True)
    barrier()
    cocoGt = COCO(annotation_file=val_annotate, use_ext=True)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    if args.no_dali:
        train_trans = SSDTransformer(dboxes, (input_size, input_size),
                                     val=False)
        train_coco = COCODetection(train_coco_root, train_annotate,
                                   train_trans)

        if args.distributed:
            train_sampler = GeneralDistributedSampler(train_coco, pad=False)
        else:
            train_sampler = None

        train_loader = DataLoader(train_coco,
                                  batch_size=args.batch_size *
                                  args.input_batch_multiplier,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=args.num_workers,
                                  collate_fn=partial(my_collate,
                                                     is_training=True))
    else:
        train_pipe = COCOPipeline(args.batch_size *
                                  args.input_batch_multiplier,
                                  args.local_rank,
                                  train_coco_root,
                                  train_annotate,
                                  N_gpu,
                                  num_threads=args.num_workers,
                                  output_fp16=args.use_fp16,
                                  output_nhwc=args.nhwc,
                                  pad_output=args.pad_input,
                                  seed=local_seed - 2**31,
                                  use_nvjpeg=args.use_nvjpeg,
                                  use_roi=args.use_roi_decode,
                                  dali_cache=args.dali_cache,
                                  dali_async=(not args.dali_sync))
        print_message(args.local_rank,
                      "time_check a: {secs:.9f}".format(secs=time.time()))
        train_pipe.build()
        print_message(args.local_rank,
                      "time_check b: {secs:.9f}".format(secs=time.time()))
        # NOTE(review): test_run is not read again in this function.
        test_run = train_pipe.run()
        train_loader = SingleDaliIterator(
            train_pipe, [
                'images',
                DALIOutput('bboxes', False, True),
                DALIOutput('labels', True, True)
            ],
            train_pipe.epoch_size()['train_reader'],
            ngpu=N_gpu)

    # Both loader flavours are wrapped by the same encoding iterator.
    train_loader = EncodingInputIterator(train_loader,
                                         dboxes=encoder.dboxes.cuda(),
                                         nhwc=args.nhwc,
                                         fake_input=args.fake_input,
                                         no_dali=args.no_dali)
    if args.input_batch_multiplier > 1:
        train_loader = RateMatcher(input_it=train_loader,
                                   output_size=args.batch_size)

    val_dataloader = DataLoader(
        val_coco,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ##### END DATA TOUCHING
    i_eval = 0
    first_epoch = 1
    # NOTE(review): train_pipe is only bound in the DALI branch above, so
    # this call (and the BLOCK_START call inside the loop) raises NameError
    # when args.no_dali is set.
    mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                 metadata={
                     'first_epoch_num':
                     first_epoch,
                     'epoch_count':
                     args.evaluation[i_eval] * 32 /
                     train_pipe.epoch_size()['train_reader']
                 },
                 sync=True)
    for epoch in range(args.epochs):
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
        for p in ssd300.parameters():
            p.grad = None

        for i, (img, bbox, label) in enumerate(train_loader):
            if args.profile_start is not None and iter_num == args.profile_start:
                torch.cuda.profiler.start()
                torch.cuda.synchronize()
                if args.profile_nvtx:
                    torch.autograd._enable_profiler(
                        torch.autograd.ProfilerState.NVTX)

            if args.profile is not None and iter_num == args.profile:
                if args.profile_start is not None and iter_num >= args.profile_start:
                    # we turned cuda and nvtx profiling on, better turn it off too
                    if args.profile_nvtx:
                        torch.autograd._disable_profiler()
                    torch.cuda.profiler.stop()
                # Bare return: the caller receives None on this path.
                return

            # Warm-up is skipped until the (possibly deferred) optimizer exists.
            if args.warmup is not None and optimizer_created:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr,
                          args)
            if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #1")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #2")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            # Such a batch is skipped entirely; iter_num is not advanced.
            if (img is None) or (bbox is None) or (label is None):
                print("No labels in batch")
                continue

            ploc, plabel = ssd300(img)
            # The loss is computed on fp32 copies of the predictions.
            ploc, plabel = ploc.float(), plabel.float()

            N = img.shape[0]
            gloc, glabel = Variable(bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if np.isfinite(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            else:
                print("model exploded (corrupted by Inf or Nan)")
                sys.exit()

            num_elapsed_samples += N
            if args.local_rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                            .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                # Reset the throughput window.
                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            # loss scaling
            if args.use_fp16:
                loss = loss * static_loss_scale
            loss.backward()

            if not optimizer_created:
                # Imitate the model bucket structure created by DDP.
                # These will already be split by type (float or half).
                model_buckets = []
                for bucket in ssd300.active_i_buckets:
                    model_buckets.append([])
                    for active_i in bucket:
                        model_buckets[-1].append(
                            ssd300.active_params[active_i])
                flat_master_buckets = create_flat_master(model_buckets)
                optim = torch.optim.SGD(flat_master_buckets,
                                        lr=current_lr,
                                        momentum=current_momentum,
                                        weight_decay=current_weight_decay)
                optimizer_created = True
                # Skip this first iteration because flattened allreduce buffers are not yet created.
                # step_maybe_fp16_maybe_distributed(optim)
            else:
                step_maybe_fp16_maybe_distributed(optim)

            # Likely a decent skew here, let's take this opportunity to set the gradients to None.
            # After DALI integration, playing with the placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            if iter_num in eval_points:
                # Get the existing state from the train model
                # * if we use distributed, then we want .module
                train_model = ssd300.module if args.distributed else ssd300

                if args.distributed and args.allreduce_running_stats:
                    if get_rank() == 0:
                        print("averaging bn running means and vars")
                    # make sure every node has the same running bn stats before
                    # using them to evaluate, or saving the model for inference
                    world_size = float(torch.distributed.get_world_size())
                    for bn_name, bn_buf in train_model.named_buffers(
                            recurse=True):
                        if ('running_mean' in bn_name) or ('running_var'
                                                           in bn_name):
                            # NOTE(review): `dist` is not bound anywhere in
                            # this function; presumably a module-level
                            # `import torch.distributed as dist` -- confirm.
                            torch.distributed.all_reduce(
                                bn_buf, op=dist.ReduceOp.SUM)
                            bn_buf /= world_size

                if get_rank() == 0:
                    if not args.no_save:
                        print("saving model...")
                        # The state dict saved here comes from `ssd300`
                        # itself, not from the unwrapped `train_model`.
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": val_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                # Evaluation runs on the separate eval model, refreshed with
                # the current training weights.
                ssd300_eval.load_state_dict(train_model.state_dict())
                succ = coco_eval(
                    ssd300_eval,
                    val_dataloader,
                    cocoGt,
                    encoder,
                    inv_map,
                    args.threshold,
                    epoch,
                    iter_num,
                    args.eval_batch_size,
                    use_fp16=args.use_fp16,
                    local_rank=args.local_rank if args.distributed else -1,
                    N_gpu=N_gpu,
                    use_nhwc=args.nhwc,
                    pad_input=args.pad_input)
                mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                             metadata={'first_epoch_num': first_epoch},
                             sync=True)
                if succ:
                    return True
                # Open the next logging block unless this was the last
                # evaluation point.
                if iter_num != max(eval_points):
                    i_eval += 1
                    first_epoch = epoch + 1
                    mlperf_print(
                        key=mlperf_compliance.constants.BLOCK_START,
                        metadata={
                            'first_epoch_num':
                            first_epoch,
                            'epoch_count':
                            (args.evaluation[i_eval] -
                             args.evaluation[i_eval - 1]) * 32 /
                            train_pipe.epoch_size()['train_reader']
                        },
                        sync=True)
            iter_num += 1
            if args.max_iter > 0:
                if iter_num > args.max_iter:
                    break

        train_loader.reset()
        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
    return False
def test_encoder(self):
    """A fresh Encoder numbers symbols in first-seen order and reuses codes."""
    enc = Encoder()
    # (symbol, expected code); "a" is repeated to check its code is stable.
    expectations = (("a", 0), ("b", 1), ("a", 0))
    for symbol, expected_code in expectations:
        self.assertEqual(enc.transform(symbol), expected_code)
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017, optionally data-parallel via apex DDP.

    When CUDA is usable and the ``WORLD_SIZE`` environment variable is
    greater than 1, the NCCL process group is initialised, the seed is
    broadcast and the model is wrapped in apex ``DistributedDataParallel``.
    The learning rate is ``args.lr`` scaled by ``global_batch_size / 32`` and
    multiplied by 0.1 at every epoch listed in ``args.lr_decay_schedule``.
    After each epoch ``e`` with ``e + 1`` in ``args.evaluation`` the
    batch-norm running statistics are averaged across ranks (distributed
    only), rank 0 saves a checkpoint (unless ``args.no_save``) and runs
    ``coco_eval``, and the outcome is broadcast from rank 0.

    Args:
        args: parsed command-line namespace. ``args.distributed`` is set
            here, and ``args.seed`` is overwritten in the distributed case.

    Returns:
        True once evaluation succeeds, False when all epochs are used up.

    Raises:
        ImportError: if CUDA is usable but apex cannot be imported.
        ValueError: if ``WORLD_SIZE`` is set but is not an integer.
    """
    # `import torch.utils.data.distributed` below would otherwise make
    # `torch` a local name for the whole function body.
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded, so that a malformed WORLD_SIZE is not
        # misreported as a missing APEX installation.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        # ssd_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
            print(dist.get_rank(), "Using seed = {}".format(local_seed))
            torch.manual_seed(local_seed)
            np.random.seed(seed=local_seed)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    #print("Number of labels: {}".format(train_coco.labelnum))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # DataLoader shuffles itself only when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)

    eval_points = args.evaluation
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # One-element flag tensor so the evaluation outcome can be broadcast.
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        # args.warmup is converted into a number of iterations (wb).
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
            ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are shown but kept out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            warmup_step(iter_num, current_lr)
            optim.step()

            iter_num += 1

        if epoch + 1 in eval_points:
            rank = dist.get_rank() if args.distributed else args.local_rank
            if args.distributed:
                # Average the batch-norm running statistics across ranks
                # before they are saved or used for evaluation.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
            if rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Only rank 0 evaluates; share its verdict so that every rank
            # leaves the loop together.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True

    return False