Example 1
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
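A possible call site for the factory above, sketched from its return signature; the split names and the Seq2SeqSpeaker wiring (taken from Examples 2 and 9) are assumptions, not part of the original snippet.

# Illustrative only: unpack the factory output and wrap it in a speaker agent,
# mirroring how the other examples in this listing construct Seq2SeqSpeaker.
train_env, test_envs, encoder, decoder = make_env_and_models(
    args, TRAIN_VOCAB, train_splits=['train'], test_splits=['val_seen', 'val_unseen'])
agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)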
Example 2
def make_speaker(args):
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))
    agent = Seq2SeqSpeaker(
        None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
Example 3
def make_scorer(args):
    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    feature_size = FEATURE_SIZE
    traj_encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj '+ args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder '+ args.load_traj_encoder))
    return scorer
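The empty-string guards above rely on value comparison: Python's `is` operator checks object identity rather than equality, so `!=`/`==` is the right tool for string checks (and `is` against a literal raises a SyntaxWarning on Python 3.8+). A small self-contained illustration, unrelated to the R2R code paths:

# Identity vs. equality for strings (illustrative only).
prefix = 'snap'
path = prefix + 'shots'      # constructed at runtime, a distinct str object
print(path == 'snapshots')   # True: value comparison, what an empty-string check needs
print(path is 'snapshots')   # False in CPython: identity comparison of two different objects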
Example 4
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    wordvec = np.load(args.wordvec_path)

    vocab = read_vocab(TRAIN_VOCAB, args.language)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           hidden_size,
                           dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
Example 5
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)

    vocab = read_vocab(TRAIN_VOCAB, args.language)
    word_embedding_size = get_word_embedding_size(args)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           args.dropout_ratio,
                           bidirectional=args.bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           args.hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, args.max_input_length)
    return agent
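A hedged sketch of invoking this args-driven variant; the attribute names below are the ones the function reads, while the concrete values (and any extra fields required by get_word_embedding_size) are illustrative assumptions.

import argparse

# Illustrative defaults only; the project's own argument parser defines the real ones.
parser = argparse.ArgumentParser()
parser.add_argument('--hidden_size', type=int, default=512)
parser.add_argument('--bidirectional', action='store_true')
parser.add_argument('--dropout_ratio', type=float, default=0.5)
parser.add_argument('--wordvec_path', default='tasks/R2R/data/train_glove.npy')
parser.add_argument('--wordvec_finetune', action='store_true')
parser.add_argument('--language', default='en')
parser.add_argument('--max_input_length', type=int, default=80)
args = parser.parse_args([])

speaker = make_speaker(args,
                       action_embedding_size=2048 + 128,
                       feature_size=2048 + 128)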
Example 6
weight_decay = 0.0005
#weight_decay = 0.0001
FEATURE_SIZE = 2048+128
n_iters = 5000
log_every = 100
save_every = 100

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)

enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
feature_size = FEATURE_SIZE

visEncoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))    
lanEncoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=False, glove=glove))
dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))

agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)
#agent.load('tasks/R2R/snapshots/release/speaker_final_release', map_location = 'cpu')
agent.load('tasks/R2R/compat/trained_1/compat_sample_imagenet_mean_pooled_train_iter_1000', map_location='cpu')
if __name__ == "__main__":
    traj = {'scan':'5q7pvUzZiYa', 'path':["7dc12a67ddfc4a4a849ce620db5b777b", "0e84cf4dec784bc28b78a80bee35c550", "a77784b955454209857d745976a1676d", "67971a17c26f4e2ca117b4fca73507fe", "8db06d3a0dd44508b3c078d60126ce19", "43ac37dfa1db4a13a8a9df4e454eb016", "4bd82c990a6548a994daa97c8f52db06", "6d11ca4d41e04bb1a725c2223c36b2aa", "29fb3c58b29348558d36a9f9440a1379", "c23f26401359426982d11ca494ee739b", "397403366d784caf804d741f32fd68b9", "3c6a35e15ada4b649990d6568cce8bd9", "55e4436f528c4bf09e4550079c572f7b", "69fad7dd177847dbabf69e8fb7c00ddf", "c629c7f1cf6f47a78c45a8ae9ff82247", "21fca0d6192940e580587fe317440f56", "4b85d61dd3a94e8a812affe78f3a322d", "3c025b8e3d2040969cd00dd0e9f29b09"][:2], 'heading':0.0,'elevation_init':0.0}
    encoded_instructions, _ = tok.encode_sentence('')
    encoded_instructions = torch.tensor([encoded_instructions], device='cpu')
    rdv_test = rdv(traj)
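    # Illustrative continuation (not in the original snippet): encode a real
    # instruction with the same Tokenizer instead of the empty string above.
    # encode_sentence returns the token ids plus a second value that is
    # discarded here, matching the call a few lines up.
    instruction = 'Walk past the kitchen and stop at the front door.'
    encoded, _ = tok.encode_sentence(instruction)
    encoded = torch.tensor([encoded], device='cpu')  # batch of one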
    
Example 7
def make_env_and_models(args,
                        train_vocab_path,
                        train_splits,
                        test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=train_splits,
                         tokenizer=tok)

    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE

    # =============================================================================
    #     visEncoder = try_cuda(CompatVisEncoderLSTM(
    #         action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #         bidirectional=bidirectional))
    # =============================================================================
    visEncoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    # =============================================================================
    #     lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #         len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #         dropout_ratio, bidirectional=True, glove=glove))
    # =============================================================================
    lanEncoder = try_cuda(
        EncoderLSTM(len(vocab),
                    word_embedding_size,
                    enc_hidden_size,
                    vocab_pad_idx,
                    dropout_ratio,
                    bidirectional=False,
                    glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    #visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    #lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=[split],
                         tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }

    #test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[
        3000:4000]
    return train_env, test_envs, visEncoder, lanEncoder, dotSim
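A possible call site, sketched from the return signature; the compatModel wiring follows the constructor call in Example 6, and the split names are assumptions.

# Illustrative only: build the environments and the three modules, then assemble
# the compatibility model the same way Example 6 does.
train_env, test_envs, visEncoder, lanEncoder, dotSim = make_env_and_models(
    args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'])
agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)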
Example 8
def make_env_and_models(args,
                        train_vocab_path,
                        train_splits,
                        test_splits,
                        test_instruction_limit=None,
                        instructions_per_path=None):
    setup()
    if args.env == 'r2r':
        EnvBatch = R2RBatch
        ImgFeatures = ImageFeatures
    elif args.env == 'refer360':
        EnvBatch = Refer360Batch
        ImgFeatures = Refer360ImageFeatures
    else:
        raise NotImplementedError(
            'this {} environment is not implemented.'.format(args.env))

    image_features_list = ImgFeatures.from_args(args)
    feature_size = sum(
        [featurizer.feature_dim for featurizer in image_features_list]) + 128
    if args.use_visited_embeddings:
        feature_size += 64
    if args.use_oracle_embeddings:
        feature_size += 64
    action_embedding_size = feature_size

    vocab = read_vocab(train_vocab_path, args.language)
    tok = Tokenizer(vocab=vocab)

    train_env = EnvBatch(image_features_list,
                         splits=train_splits,
                         tokenizer=tok,
                         args=args)

    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)

    word_embedding_size = get_word_embedding_size(args)
    # enc_hidden_size = 600  # refer360 >>>
    enc_hidden_size = 512  # refer360 >>>
    # enc_hidden_size = 512  # r2r >>>

    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           args.dropout_ratio,
                           bidirectional=args.bidirectional))
    word_embedding_size = 300  # refer360 / r2r >>>>
    # hidden_size = 600  # refer360 >>>
    hidden_size = 512  # refer360 >>>
    # hidden_size = 512  # >>> r2r
    # hidden_size = args.hidden_size

    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))

    test_envs = {}
    for split in test_splits:
        b = EnvBatch(image_features_list,
                     splits=[split],
                     tokenizer=tok,
                     args=args)
        e = eval_speaker.SpeakerEvaluation(
            [split], instructions_per_path=instructions_per_path, args=args)
        test_envs[split] = (b, e)

    # TODO
    # test_envs = {
    #     split: (BatchEnv(image_features_list, batch_size=batch_size,
    #                      splits=[split], tokenizer=tok,
    #                      instruction_limit=test_instruction_limit,
    #                      prefix=args.prefix),
    #             eval_speaker.SpeakerEvaluation(
    #                 [split], instructions_per_path=instructions_per_path, ))
    #     for split in test_splits}

    return train_env, test_envs, encoder, decoder
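A hedged sketch of consuming the returned environments; the unpacking mirrors the (env, evaluator) tuples built in the loop above, while the split names and keyword values are illustrative.

# Illustrative only: each test_envs entry pairs a batch environment with its
# SpeakerEvaluation helper, keyed by split name.
train_env, test_envs, encoder, decoder = make_env_and_models(
    args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'],
    instructions_per_path=1)
for split, (env, evaluator) in test_envs.items():
    print(split, type(env).__name__, type(evaluator).__name__)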
Example 9
glove_path = 'tasks/R2R/data/train_glove.npy'
action_embedding_size = 2048 + 128
hidden_size = 512
bidirectional = False
dropout_ratio = 0.5
feedback_method = 'sample'  # teacher or sample
learning_rate = 0.0001
weight_decay = 0.0005
feature_size = 2048 + 128
glove = np.load(glove_path)

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)

encoder = try_cuda(
    SpeakerEncoderLSTM(action_embedding_size, feature_size, hidden_size,
                       dropout_ratio))

decoder = try_cuda(
    SpeakerDecoderLSTM(len(vocab),
                       word_embedding_size,
                       hidden_size,
                       dropout_ratio,
                       glove=glove))
agent = Seq2SeqSpeaker(tok, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
agent.load('tasks/R2R/snapshots/release/speaker_final_release',
           map_location='cpu')
if __name__ == "__main__":
    traj = {
        'scan':
        '5q7pvUzZiYa',
        'path': [