Example #1
def lemma_to_synset_key(keyin, keyout):
    for line in keyin:
        inst_id, lemma_ids = line.split(" ", 1)
        keyout.write(inst_id)
        for lemma_id in lemma_ids.split():
            keyout.write(
                " " + wordnet.ss2of(wordnet.lemma_from_key(lemma_id).synset()))
        keyout.write("\n")
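A hypothetical invocation sketch using in-memory files; the key-file format ("instance_id lemma_key ...") and the instance id shown are assumptions inferred from the parsing logic above:

import io
from nltk.corpus import wordnet

keyin = io.StringIO("d000.s000.t000 dog%1:05:00::\n")
keyout = io.StringIO()
lemma_to_synset_key(keyin, keyout)
print(keyout.getvalue())  # "d000.s000.t000 02084442-n" with WordNet 3.0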
Example #2
def get_in_id(wordnet_ss):
    """
    Transforms a worndet synset into a imagenet id
    Input: Synset
    :param wordnet_ss:
    :return: imagenet id (string)
    """
    wn_id = wn.ss2of(wordnet_ss)
    return wn_id[-1] + wn_id[:8]
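A brief usage sketch (the offset shown is from WordNet 3.0; wn.ss2of returns "02084442-n", so the function rearranges it into the ImageNet 'n' + offset form):

from nltk.corpus import wordnet as wn

print(get_in_id(wn.synset('dog.n.01')))  # -> 'n02084442'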
Example #3
def main():
    global args, best_prec1, poinc_emb
    global imgnet_poinc_wgt, imgnet_labels
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.emb_name is None:
        raise NameError('args.emb_name is not specified')

    #load Poincare embedding
    poinc_emb = torch.load(args.emb_dir + args.emb_name)
    print('EMBEDDING TYPE:', poinc_emb['conf']['manifold'])
    n_emb_dims = poinc_emb['embeddings'].shape[1]
    args.n_emb_dims = n_emb_dims
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format
    synset_list = [wn.synset(i) for i in poinc_emb['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    poinc_emb['objects'] = ['n' + i.split('-')[0] for i in offset_list]

    #settings for distributed training
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
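For context, torch.multiprocessing.spawn passes the process index as the first positional argument, so the main_worker referenced above would have a signature along these lines (a sketch only; the body is not part of this example):

def main_worker(gpu, ngpus_per_node, args):
    # gpu is the process index supplied by mp.spawn (or args.gpu / None in
    # the single-process branch); the rest comes from the args=(...) tuple.
    ...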
Example #4
def wordnet_batcher():
    all_synsets = wordnet.all_synsets()
    batch_size = get_batch_size()
    while True:
        batch = islice(all_synsets, batch_size)
        ids = []
        defns = []
        for synset in batch:
            ids.append(wordnet.ss2of(synset))
            defns.append(synset.definition())
        if not ids:
            return
        yield ids, defns
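The generator above relies on islice from itertools and on a get_batch_size() helper that is not shown in this excerpt. A minimal driver sketch, with a hypothetical stand-in for the batch size, could look like this:

from itertools import islice
from nltk.corpus import wordnet

def get_batch_size():
    return 64  # hypothetical stand-in; the real helper is not shown

for ids, defns in wordnet_batcher():
    print(ids[0], defns[0])  # e.g. the first synset id and its gloss
    break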
Example #5
def main(pred_matrix, pb_defns, out, reject_non_english, use_model, synset):
    # Load mapping from English PropBank senses to English WordNet senses
    mapping = {}
    with open(pred_matrix, 'r') as matrix:
        matrix = csv.DictReader(matrix, delimiter='\t')
        for row in matrix:
            if reject_non_english and row['1_ID_LANG'] != 'id:eng':
                continue
            if row['11_WN_SENSE'] != 'wn:NULL':
                pb = row['16_PB_ROLESET'].split(':', 1)[1]
                wn = row['11_WN_SENSE'].split(':', 1)[1]
                mapping.setdefault(pb, set()).add(wn)

    # Join with mapping from Finnish to English PropBank
    with open(pb_defns, 'r') as propbank, open(out, 'w') as csvout:
        propbank = csv.DictReader(propbank, delimiter='\t')
        csvout = csv.writer(csvout)
        csvout.writerow(['pb', 'wn'])
        propbank = peekable(propbank)
        for row in propbank:
            pb_finn = "{}.{:0>2}".format(row['base'], row['number'])
            if use_model:
                match = MODEL_RE.match(row['note'])
                if match:
                    pb = match.group(1)
                else:
                    pb = None
            else:
                pb = row['link_original']
            if pb == 'none.01':
                pb = None
            if pb is not None and pb in mapping:
                for wn in mapping[pb]:
                    if synset:
                        csvout.writerow(
                            (pb_finn,
                             wordnet.ss2of(
                                 wordnet.lemma_from_key(wn + "::").synset())))
                    else:
                        csvout.writerow((pb_finn, wn))
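Two names used above come from elsewhere in the original module: peekable is from more_itertools, and MODEL_RE extracts a PropBank roleset from the note column. A hedged sketch of plausible definitions (the regex is hypothetical, not the author's actual pattern):

import re
from more_itertools import peekable

# Hypothetical pattern; the real MODEL_RE is not shown in this excerpt.
MODEL_RE = re.compile(r'(\S+\.\d{2})')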
Example #6
def main():

    #parse args
    global args
    args = parser.parse_args()
    if args.emb_file_name is None:
        raise NameError('args.emb_file_name is not specified')

    #GPU setting
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True

    #import dataset
    embedding = torch.load(args.emb_dir+args.emb_file_name)
    print('EMBEDDING TYPE:', embedding['manifold'])
    n_emb_dims = embedding['embeddings'].shape[1]
    args.n_emb_dims = n_emb_dims
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format
    synset_list = [wn.synset(i) for i in embedding['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    embedding['objects'] = ['n'+i.split('-')[0] for i in offset_list]

    #load the CNN part of the model
    print("=> using pre-trained model '{}'".format(args.arch))
    orig_vgg = models.__dict__[args.arch](pretrained=True)

    #change the model to project into desired embedding space
    if embedding['manifold'] == 'poincare':
        model = PoincareEmbVGG(orig_vgg, args.n_emb_dims)
    elif embedding['manifold'] == 'euclidean':
        model = EuclidEmbVGG(orig_vgg, args.n_emb_dims)
    else:
        raise ValueError("unknown manifold: '{}'".format(embedding['manifold']))
    model.to(device, non_blocking=True)
    model.features = torch.nn.DataParallel(model.features)

    #load weights from training on 1K classes
    if os.path.isfile(args.saved_weights):
        print("=> loading checkpoint '{}'".format(args.saved_weights))
        checkpoint = torch.load(args.saved_weights)
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}'".format(args.saved_weights,
                                                 checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(args.saved_weights))

    #data loading
    evaldir = '/mnt/fast-data15/datasets/imagenet/fa2011'
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    eval_dataset = datasets.ImageFolder(evaldir, transforms.Compose([
                                transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                normalize,]))

    eval_loader = torch.utils.data.DataLoader(
            eval_dataset, batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)

    #sort embedding to match image labels
    img_labels = eval_dataset.classes
    img2emb_idx = [embedding['objects'].index(i)
                   for i in img_labels]
    emb_wgts = embedding['embeddings'][img2emb_idx]
    emb_wgts = emb_wgts.float().to(device, non_blocking=True)
    n_classes = emb_wgts.shape[0]

    #load 21k class distance matrix
    class_distance_mat = torch.load('class_dist_mat.pt').to(device,
            non_blocking=True)
    class_distance_mat = class_distance_mat+torch.t(class_distance_mat)

    #trackers
    batch_time = AverageMeter('Time', ':6.3f')
    top5_pos_track = AverageMeter('Top5+', ':6.2f')
    top5_neg_track = AverageMeter('Top5-', ':6.2f')
    progress = ProgressMeter(
        len(eval_loader),
        [batch_time, top5_pos_track, top5_neg_track],
        prefix='Eval: ')

    #evaluate
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(eval_loader):
            images = images.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            #compute output
            output = model(images)

            #evaluate
            preds = prediction(output, emb_wgts, 5, embedding['manifold'])
            target_dist_mat = class_distance_mat[target]
            top5_pos, top5_neg = calc_top5_pos_neg(preds, target_dist_mat)

            #track evaluation
            top5_pos_track.update(top5_pos, preds.shape[0])
            top5_neg_track.update(top5_neg, preds.shape[0])

            #measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                progress.display(i)

        print(' * Top5+ {top5_pos_track.avg:.3f} '
              'Top5- {top5_neg_track.avg:.3f}'.format(
                  top5_pos_track=top5_pos_track,
                  top5_neg_track=top5_neg_track))
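The prediction() helper is not shown in this excerpt. A minimal sketch of a top-k nearest-class lookup in the embedding space, assuming the standard Poincare-ball distance, might look like this (a stand-in, not the author's implementation):

import torch

def prediction_sketch(output, emb_wgts, k, manifold):
    # output: (B, D) predicted embeddings; emb_wgts: (C, D) class embeddings
    if manifold == 'poincare':
        # d(u, v) = arccosh(1 + 2|u-v|^2 / ((1-|u|^2)(1-|v|^2)))
        sq = (output.unsqueeze(1) - emb_wgts.unsqueeze(0)).pow(2).sum(-1)
        denom = ((1 - output.pow(2).sum(-1, keepdim=True))
                 * (1 - emb_wgts.pow(2).sum(-1)).unsqueeze(0))
        dist = torch.acosh(1 + 2 * sq / denom)
    else:  # euclidean
        dist = torch.cdist(output, emb_wgts)
    return dist.topk(k, largest=False).indices  # (B, k) class indices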
Example #7
def ss2of(self, key, synset):
    """Convert synset to offset."""
    off = 'n' + wn.ss2of(synset)[:8]
    self._labels[key] = off
    return off
Example #8
def main():
    global args, best_prec1, poinc_emb
    global imgnet_poinc_wgt, imgnet_poinc_labels
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.emb_name is None:
        raise NameError('args.emb_name is not specified')

    #load Lorentzian embedding (assigned to poinc_emb, which the declared
    #globals and the rest of this function reference)
    poinc_emb = torch.load(args.emb_dir + args.emb_name)
    print('EMBEDDING TYPE:', poinc_emb['manifold'])
    n_emb_dims = poinc_emb['embeddings'].shape[1] - 1
    print('NUM OF DIMENSIONS:', n_emb_dims)

    #change labels from synset names into imagenet format
    synset_list = [wn.synset(i) for i in poinc_emb['objects']]
    offset_list = [wn.ss2of(j) for j in synset_list]
    poinc_emb['objects'] = ['n' + i.split('-')[0] for i in offset_list]

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        orig_vgg = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        orig_vgg = models.__dict__[args.arch]()

    #Change model to project into poincare space
    model = PoincareVGG(orig_vgg, n_emb_dims, args.unfreeze)

    if args.gpu is not None:
        model = model.cuda(args.gpu)
    else:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = LorentzXEntropyLoss()
    if args.unfreeze:
        optimizer = torch.optim.SGD([{
            'params': model.features.parameters(),
            'lr': args.lr * 10**-1
        }, {
            'params': model.fc.parameters()
        }, {
            'params': model.classifier.parameters()
        }],
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                           model.parameters()),
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    lr_sched = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_decay_interval, gamma=args.lr_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_sched.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val_white')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    #create poincare embedding that only contains imagenet synsets
    imgnet_poinc_labels = train_dataset.classes
    imgnet2poinc_idx = [
        poinc_emb['objects'].index(i) for i in imgnet_poinc_labels
    ]
    imgnet_poinc_wgt = poinc_emb['embeddings'][imgnet2poinc_idx]
    imgnet_poinc_wgt = imgnet_poinc_wgt.float().cuda(non_blocking=True)

    #create train and val data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        lr_sched.step()

        # train the model
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
                'scheduler': lr_sched.state_dict(),
            }, is_best, args.emb_name + '_checkp.pth.tar')
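One detail worth noting above: n_emb_dims is embeddings.shape[1] - 1 because the Lorentz (hyperboloid) model represents an n-dimensional hyperbolic space with n+1 ambient coordinates. For reference, a sketch of the Lorentz inner product and distance (not the LorentzXEntropyLoss used above, which is defined elsewhere):

import torch

def lorentz_inner(u, v):
    # <u, v>_L = -u_0 * v_0 + sum_i u_i * v_i
    return -u[..., 0] * v[..., 0] + (u[..., 1:] * v[..., 1:]).sum(-1)

def lorentz_distance(u, v):
    # Points on the hyperboloid satisfy <u, u>_L = -1; clamp guards rounding.
    return torch.acosh(torch.clamp(-lorentz_inner(u, v), min=1.0))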
Example #9
def get_synset_id(ss):
    if not ss:
        return None
    return wn.ss2of(ss)
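Usage sketch, assuming wn is nltk.corpus.wordnet (offset from WordNet 3.0):

from nltk.corpus import wordnet as wn

print(get_synset_id(wn.synset('dog.n.01')))  # -> '02084442-n'
print(get_synset_id(None))                   # -> None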
Example #10
def lemma_id_to_synset_id(lemma_id):
    from nltk.corpus import wordnet
    return wordnet.ss2of(wordnet.lemma_from_key(lemma_id + "::").synset())
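A usage sketch; the function appends "::" to fill the empty head_word/head_id fields of the sense key (lemma key and offset shown are from WordNet 3.0):

print(lemma_id_to_synset_id('dog%1:05:00'))  # -> '02084442-n'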
Example #11
# -*- coding: utf-8 -*-
"""
Get definition and examples from WordNet ID
"""
from nltk.corpus import wordnet as wn
word = input("WordNet ID : ")
word_wn = wn.of2ss(word.replace('-', ''))
print(word_wn.definition())
print(word_wn.examples())
print("WordNet ID : " + wn.ss2of(word_wn))