def lemma_to_synset_key(keyin, keyout):
    """Rewrite a WSD key stream, replacing lemma sense keys with synset ids.

    Each input line is '<instance_id> <sense_key> [<sense_key> ...]'; the
    output line keeps the instance id and maps every sense key to the
    offset id of its synset via wordnet.ss2of.

    :param keyin: iterable of input key lines
    :param keyout: writable text stream for the converted lines
    """
    for raw_line in keyin:
        instance_id, rest = raw_line.split(" ", 1)
        synset_ids = [
            wordnet.ss2of(wordnet.lemma_from_key(sense_key).synset())
            for sense_key in rest.split()
        ]
        keyout.write(" ".join([instance_id] + synset_ids))
        keyout.write("\n")
def get_in_id(wordnet_ss):
    """Convert a WordNet synset into an ImageNet id.

    wn.ss2of yields '<8-digit offset>-<pos>' (e.g. '02084071-n'); the
    ImageNet id is the POS letter followed by the offset ('n02084071').

    :param wordnet_ss: a WordNet Synset
    :return: ImageNet id (string)
    """
    offset_pos = wn.ss2of(wordnet_ss)
    pos_letter = offset_pos[-1]
    return pos_letter + offset_pos[:8]
def main():
    """Entry point: apply seeding/GPU options, load the Poincare embedding,
    convert its labels to ImageNet ids, and hand off to main_worker
    (spawned per GPU when multiprocessing-distributed training is on).
    """
    global args, best_prec1, poinc_emb
    global imgnet_poinc_wgt, imgnet_labels
    args = parser.parse_args()

    if args.seed is not None:
        # deterministic runs at the cost of CUDNN autotuning speed
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.emb_name is None:
        raise NameError('args.emb_name file not specified')

    #load Poincare embedding
    # checkpoint is a dict; this code reads 'conf', 'embeddings', 'objects'
    poinc_emb = torch.load(args.emb_dir + args.emb_name)
    print('EMBEDDING TYPE:', poinc_emb['conf']['manifold'])
    n_emb_dims = poinc_emb['embeddings'].shape[1]
    args.n_emb_dims = n_emb_dims
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format
    # ss2of gives '<offset>-<pos>'; ImageNet ids are 'n' + 8-digit offset
    synset_list = [wn.synset(i) for i in poinc_emb['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    poinc_emb['objects'] = ['n' + i.split('-')[0] for i in offset_list]

    #settings for distributed training
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def wordnet_batcher():
    """Yield (ids, definitions) batches covering every WordNet synset.

    Batches are taken lazily from wordnet.all_synsets() with islice; each
    yielded pair holds the ss2of ids and the gloss definitions for one
    batch. The generator stops as soon as a batch comes back empty.
    """
    synset_iter = wordnet.all_synsets()
    batch_size = get_batch_size()
    while True:
        ids, defns = [], []
        for synset in islice(synset_iter, batch_size):
            ids.append(wordnet.ss2of(synset))
            defns.append(synset.definition())
        if not ids:
            # iterator exhausted
            return
        yield ids, defns
def main(pred_matrix, pb_defns, out, reject_non_english, use_model, synset):
    """Link Finnish PropBank senses to English WordNet senses via the
    Predicate Matrix and write the pairs as CSV.

    :param pred_matrix: path to a TSV Predicate Matrix file
    :param pb_defns: path to a TSV of Finnish PropBank frame definitions
    :param out: path of the CSV file to write ('pb', 'wn') rows into
    :param reject_non_english: if True, keep only matrix rows whose
        '1_ID_LANG' column is 'id:eng'
    :param use_model: if True, take the English PropBank sense from the
        'note' column via MODEL_RE instead of the 'link_original' column
    :param synset: if True, emit synset offset ids instead of sense keys
    """
    # Load mapping from English PropBank senses to English WordNet senses
    mapping = {}
    with open(pred_matrix, 'r') as matrix:
        matrix = csv.DictReader(matrix, delimiter='\t')
        for row in matrix:
            if reject_non_english and row['1_ID_LANG'] != 'id:eng':
                continue
            if row['11_WN_SENSE'] != 'wn:NULL':
                # strip the 'pb:' / 'wn:' prefixes from the matrix columns
                pb = row['16_PB_ROLESET'].split(':', 1)[1]
                # NOTE(review): local name 'wn' shadows any module-level
                # wordnet alias within this function
                wn = row['11_WN_SENSE'].split(':', 1)[1]
                mapping.setdefault(pb, set()).add(wn)
    # Join with mapping from Finnish to English PropBank
    with open(pb_defns, 'r') as propbank, open(out, 'w') as csvout:
        propbank = csv.DictReader(propbank, delimiter='\t')
        csvout = csv.writer(csvout)
        csvout.writerow(['pb', 'wn'])
        propbank = peekable(propbank)
        for row in propbank:
            # Finnish sense id, e.g. 'base.01' (sense number zero-padded)
            pb_finn = "{}.{:0>2}".format(row['base'], row['number'])
            if use_model:
                match = MODEL_RE.match(row['note'])
                if match:
                    pb = match.group(1)
                else:
                    pb = None
            else:
                pb = row['link_original']
                # 'none.01' marks an absent link -- TODO(review): confirm this
                # check is meant to apply only on the 'link_original' branch
                if pb == 'none.01':
                    pb = None
            if pb is not None and pb in mapping:
                for wn in mapping[pb]:
                    if synset:
                        # resolve the sense key (':: ' suffix completes the
                        # key format) to its synset offset id
                        csvout.writerow(
                            (pb_finn,
                             wordnet.ss2of(
                                 wordnet.lemma_from_key(wn + "::").synset())))
                    else:
                        csvout.writerow((pb_finn, wn))
def main():
    """Evaluate an embedding-projection VGG on ImageNet with hierarchical
    top-5 distance metrics (Top5+ / Top5-).

    Fixes applied in review:
      * removed debugging leftovers (per-batch print, pdb.set_trace at
        i == 40, commented-out skip code)
      * the 'loaded checkpoint' message formatted two arguments into a
        single '{}' placeholder, silently dropping the epoch
      * raise on an unknown manifold instead of a later NameError on
        'model'
      * dropped the unused local 'n_classes'
    """
    #parse args
    global args
    args = parser.parse_args()
    if args.emb_file_name is None:
        raise NameError('args.emb_file_name is not specified')

    #GPU setting
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True

    #import dataset: checkpoint with 'manifold', 'embeddings', 'objects'
    embedding = torch.load(args.emb_dir + args.emb_file_name)
    print('EMBEDDING TYPE:', embedding['manifold'])
    n_emb_dims = embedding['embeddings'].shape[1]
    args.n_emb_dims = n_emb_dims
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format ('n' + 8-digit offset)
    synset_list = [wn.synset(i) for i in embedding['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    embedding['objects'] = ['n' + i.split('-')[0] for i in offset_list]

    #load the CNN part of the model
    print("=>using pre-trained model '{}'".format(args.arch))
    orig_vgg = models.__dict__[args.arch](pretrained=True)

    #change the model to project into desired embedding space
    if embedding['manifold'] == 'poincare':
        model = PoincareEmbVGG(orig_vgg, args.n_emb_dims)
    elif embedding['manifold'] == 'euclidean':
        model = EuclidEmbVGG(orig_vgg, args.n_emb_dims)
    else:
        # previously fell through and crashed later with a NameError
        raise ValueError(
            "unsupported manifold: {}".format(embedding['manifold']))
    model.to(device, non_blocking=True)
    model.features = torch.nn.DataParallel(model.features)

    #load weights from training on 1K classes
    if os.path.isfile(args.saved_weights):
        print("=> loading checkpoint '{}'".format(args.saved_weights))
        checkpoint = torch.load(args.saved_weights)
        model.load_state_dict(checkpoint['state_dict'])
        # fixed: the epoch argument previously had no placeholder
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.saved_weights, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(args.saved_weights))

    #data loading
    evaldir = '/mnt/fast-data15/datasets/imagenet/fa2011'
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    eval_dataset = datasets.ImageFolder(
        evaldir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    #sort embedding to match image labels
    img_labels = eval_dataset.classes
    img2emb_idx = [embedding['objects'].index(i) for i in img_labels]
    emb_wgts = embedding['embeddings'][img2emb_idx]
    emb_wgts = emb_wgts.float().to(device, non_blocking=True)

    #load 21k class distance matrix (stored triangular; symmetrize)
    class_distance_mat = torch.load('class_dist_mat.pt').to(
        device, non_blocking=True)
    class_distance_mat = class_distance_mat + torch.t(class_distance_mat)

    #trackers
    batch_time = AverageMeter('Time', ':6.3f')
    top5_pos_track = AverageMeter('Top5+', ':6.2f')
    top5_neg_track = AverageMeter('Top5-', ':6.2f')
    progress = ProgressMeter(
        len(eval_loader), [batch_time, top5_pos_track, top5_neg_track],
        prefix='Eval: ')

    #evaluate
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(eval_loader):
            images = images.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            #compute output
            output = model(images)

            #evaluate: nearest-embedding top-5 predictions
            preds = prediction(output, emb_wgts, 5, embedding['manifold'])
            target_dist_mat = class_distance_mat[target]
            top5_pos, top5_neg = calc_top5_pos_neg(preds, target_dist_mat)

            #track evaluation
            top5_pos_track.update(top5_pos, preds.shape[0])
            top5_neg_track.update(top5_neg, preds.shape[0])

            #measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        print(
            ' * Top5+ {top5_pos_track.avg: .3f} Top5- {top5_neg_track.avg:.3f}'
            .format(top5_pos_track=top5_pos_track,
                    top5_neg_track=top5_neg_track))
def ss2of(self, key, synset):
    """Convert *synset* to its ImageNet-style offset id and cache it.

    The id is the literal prefix 'n' plus the 8-digit WordNet offset
    taken from wn.ss2of. The result is stored in self._labels under
    *key* before being returned.
    """
    offset_id = 'n' + wn.ss2of(synset)[:8]
    self._labels[key] = offset_id
    return offset_id
def main():
    """Train a VGG that projects images into a Lorentz-model hyperbolic
    embedding space, with optional resume/evaluate modes.

    Fixes applied in review:
      * the embedding checkpoint was loaded into a dead local
        ('lorentz_emb') while every later statement read the undefined
        global 'poinc_emb' -> NameError; it is now loaded into 'poinc_emb'
      * the LR scheduler was stepped at the top of each epoch; since
        PyTorch 1.1 scheduler.step() must follow the epoch's optimizer
        updates, otherwise the first LR value is skipped
    """
    global args, best_prec1, poinc_emb
    global imgnet_poinc_wgt, imgnet_poinc_labels
    args = parser.parse_args()

    if args.seed is not None:
        # deterministic runs at the cost of CUDNN autotuning speed
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.emb_name is None:
        raise NameError('args.emb_name file not specified')

    #load Lorentzian embedding (into the global the rest of the code uses)
    poinc_emb = torch.load(args.emb_dir + args.emb_name)
    print('EMBEDDING TYPE:', poinc_emb['manifold'])
    # Lorentz coordinates carry one extra (time-like) dimension, hence -1
    n_emb_dims = poinc_emb['embeddings'].shape[1] - 1
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format ('n' + 8-digit offset)
    synset_list = [wn.synset(i) for i in poinc_emb['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    poinc_emb['objects'] = ['n' + i.split('-')[0] for i in offset_list]

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        orig_vgg = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        orig_vgg = models.__dict__[args.arch]()

    #Change model to project into poincare space
    model = PoincareVGG(orig_vgg, n_emb_dims, args.unfreeze)

    if args.gpu is not None:
        model = model.cuda(args.gpu)
    else:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = LorentzXEntropyLoss()
    if args.unfreeze:
        # fine-tune the convolutional features with a 10x smaller LR
        optimizer = torch.optim.SGD(
            [{'params': model.features.parameters(), 'lr': args.lr * 10**-1},
             {'params': model.fc.parameters()},
             {'params': model.classifier.parameters()}],
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)
    lr_sched = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_decay_interval, gamma=args.lr_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_sched.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val_white')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    #create poincare embedding that only contains imagenet synsets
    imgnet_poinc_labels = train_dataset.classes
    imgnet2poinc_idx = [
        poinc_emb['objects'].index(i) for i in imgnet_poinc_labels
    ]
    imgnet_poinc_wgt = poinc_emb['embeddings'][imgnet2poinc_idx]
    imgnet_poinc_wgt = imgnet_poinc_wgt.float().cuda(non_blocking=True)

    #create train and val data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        # train the model
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # step AFTER this epoch's optimizer updates (PyTorch >= 1.1 order)
        lr_sched.step()

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
                'scheduler': lr_sched.state_dict(),
            }, is_best, args.emb_name + '_checkp.pth.tar')
def get_synset_id(ss):
    """Return the WordNet offset id for *ss*, or None when *ss* is falsy."""
    return wn.ss2of(ss) if ss else None
def lemma_id_to_synset_id(lemma_id):
    """Resolve a WordNet lemma sense key (minus its '::' suffix) to the
    offset id of its synset."""
    from nltk.corpus import wordnet
    lemma = wordnet.lemma_from_key(lemma_id + "::")
    return wordnet.ss2of(lemma.synset())
# -*- coding: utf-8 -*-
"""
Get definition and examples from WordNet ID
"""
from nltk.corpus import wordnet as wn


def main():
    """Prompt for a WordNet offset id and print its definition, examples,
    and canonical id.

    Accepts ids in either '02084071-n' or '02084071n' form: of2ss expects
    the 8-digit offset immediately followed by the POS letter, so any dash
    is stripped first.
    """
    word = input("WordNet ID : ")
    word_wn = wn.of2ss(word.replace('-', ''))
    print(word_wn.definition())
    print(word_wn.examples())
    print("WordNet ID : " + wn.ss2of(word_wn))


# guard the interactive prompt so importing this module has no side effects
if __name__ == "__main__":
    main()