def get_model(text_proc, args):
    sent_vocab = text_proc.vocab
    model = ActionPropDenseCap(d_model=args.d_model,
                               d_hidden=args.d_hidden,
                               n_layers=args.n_layers,
                               n_heads=args.n_heads,
                               vocab=sent_vocab,
                               in_emb_dropout=args.in_emb_dropout,
                               attn_dropout=args.attn_dropout,
                               vis_emb_dropout=args.vis_emb_dropout,
                               cap_dropout=args.cap_dropout,
                               nsamples=0,
                               kernel_list=args.kernel_list,
                               stride_factor=args.stride_factor,
                               learn_mask=args.learn_mask)

    # Initialize the networks and the criterion
    if len(args.start_from) > 0:
        print("Initializing weights from {}".format(args.start_from))
        model.load_state_dict(
            torch.load(args.start_from,
                       map_location=lambda storage, location: storage))

    # Ship the model to GPU, maybe
    if args.cuda:
        model.cuda()

    return model
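# A minimal, self-contained sketch of the checkpoint-loading pattern used above,
# assuming a toy nn.Linear module and a hypothetical file name "toy_ckpt.pth"
# (neither comes from the code above; only the torch.save / torch.load /
# load_state_dict calls mirror what get_model does). The map_location lambda keeps
# every tensor on CPU, so a checkpoint written on a GPU machine can still be
# restored on a CPU-only machine before any .cuda() call.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)                        # stand-in for ActionPropDenseCap
torch.save(net.state_dict(), "toy_ckpt.pth")

restored = nn.Linear(4, 2)
restored.load_state_dict(
    torch.load("toy_ckpt.pth",
               map_location=lambda storage, location: storage))  # force CPU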
def get_model(text_proc, args):
    sent_vocab = text_proc.vocab
    model = ActionPropDenseCap(d_model=args.d_model,
                               d_hidden=args.d_hidden,
                               n_layers=args.n_layers,
                               n_heads=args.n_heads,
                               vocab=sent_vocab,
                               in_emb_dropout=args.in_emb_dropout,
                               attn_dropout=args.attn_dropout,
                               vis_emb_dropout=args.vis_emb_dropout,
                               cap_dropout=args.cap_dropout,
                               nsamples=args.train_sample,
                               kernel_list=args.kernel_list,
                               stride_factor=args.stride_factor,
                               learn_mask=args.mask_weight > 0)

    # Initialize the networks and the criterion
    if len(args.start_from) > 0:
        print("Initializing weights from {}".format(args.start_from))
        model.load_state_dict(
            torch.load(args.start_from,
                       map_location=lambda storage, location: storage))

    # Ship the model to GPU, maybe
    if args.cuda:
        if args.distributed:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model = torch.nn.DataParallel(model).cuda()
        # elif torch.cuda.device_count() > 1:
        #     model = torch.nn.DataParallel(model).cuda()
        # else:
        #     model.cuda()

    return model
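# Note that get_model calls load_state_dict BEFORE wrapping the model:
# DataParallel / DistributedDataParallel register the original network under the
# attribute "module", so a state_dict saved from the wrapped model has every key
# prefixed with "module.". The sketch below (a hypothetical toy module, never run
# through forward, so it also works on a CPU-only machine) shows that prefix and
# one common way to strip it when a wrapped checkpoint must be loaded into an
# unwrapped model; it is an illustration, not part of the code above.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
wrapped = nn.DataParallel(net)
print(list(wrapped.state_dict().keys()))        # ['module.weight', 'module.bias']

bare = nn.Linear(4, 2)
cleaned = {k.replace("module.", "", 1): v
           for k, v in wrapped.state_dict().items()}
bare.load_state_dict(cleaned)                   # loads without key mismatch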
def get_model(text_proc, args):
    sent_vocab = text_proc.vocab  # vocabulary
    model = ActionPropDenseCap(
        d_model=args.d_model,
        d_hidden=args.d_hidden,
        n_layers=args.n_layers,
        n_heads=args.n_heads,
        vocab=sent_vocab,                      # vocabulary object
        in_emb_dropout=args.in_emb_dropout,    # 0.1
        attn_dropout=args.attn_dropout,        # 0.2
        vis_emb_dropout=args.vis_emb_dropout,  # 0.1
        cap_dropout=args.cap_dropout,          # 0.2
        nsamples=args.train_sample,            # 20
        kernel_list=args.kernel_list,
        stride_factor=args.stride_factor,
        learn_mask=args.mask_weight > 0)

    # Initialize the networks and the criterion
    if len(args.start_from) > 0:
        print("Initializing weights from {}".format(args.start_from))
        model.load_state_dict(
            torch.load(args.start_from,
                       map_location=lambda storage, location: storage))

    # Ship the model to GPU, maybe
    if args.cuda:
        if args.distributed:
            model.cuda()
            # In multi-node, multi-GPU training, feeding the data is also an issue:
            # different GPUs should read different data. DataParallel (single node,
            # multiple GPUs) simply splits each batch across the local GPUs, which
            # does not carry over to multiple nodes, because shipping data between
            # machines would badly hurt efficiency. Instead, a sampler is used so
            # that each DataLoader only loads a specific subset of the whole
            # dataset. DistributedSampler (multi-node, multi-GPU) does exactly
            # this: it assigns each worker process its own slice of the dataset,
            # so no two processes see duplicate data.
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model = torch.nn.DataParallel(model).cuda()  # single node, multiple GPUs
        # elif torch.cuda.device_count() > 1:
        #     model = torch.nn.DataParallel(model).cuda()
        # else:
        #     model.cuda()

    return model
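# The comment above mentions DistributedSampler; the sketch below uses a toy
# TensorDataset and hard-coded world_size / rank values purely for illustration
# (none of these names appear in get_model). In real distributed training, rank
# and world_size come from the initialized process group, and
# sampler.set_epoch(epoch) is called once per epoch so the shuffling changes
# between epochs while staying consistent across ranks.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(16).float())    # toy stand-in for the real dataset

world_size, rank = 4, 0                              # normally taken from the process group
sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
loader = DataLoader(dataset, batch_size=2, sampler=sampler)

sampler.set_epoch(0)                                 # reshuffle per epoch, same seed on all ranks
for (batch,) in loader:
    print(batch)                                     # only this rank's 1/4 of the data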