Example no. 1
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example no. 2
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    if args.job is None:  # create the vocab only during training (job is None)
        vocab = build_vocab(train_splits)
        write_vocab(vocab, TRAIN_VOCAB)

    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list,
                         batch_size=args.batch_size,
                         splits=train_splits,
                         tokenizer=tok) if len(train_splits) > 0 else None
    test_envs = {
        split: (R2RBatch(image_features_list,
                         batch_size=args.batch_size,
                         splits=[split],
                         tokenizer=tok), Evaluation(split, args.instrType))
        for split in test_splits
    }

    agent = make_follower(args, vocab)
    agent.env = train_env

    if args.useObjLabelOrVis in ['label', 'both']:
        if train_env is not None:
            agent.pointer.wtoi = train_env.wtoi
        else:
            agent.pointer.wtoi = test_envs[test_splits[0]][0].wtoi

    return train_env, test_envs, agent
Example no. 3
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example no. 4
def make_more_train_env(args, train_vocab_path, train_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok)
    return train_env
Example no. 5
def eval_simple_agents(args):
    ''' Run simple baselines on each split. '''
    img_features = ImageFeatures.from_args(args)
    for split in ['train', 'val_seen', 'val_unseen', 'test']:
        env = R2RBatch(img_features, batch_size=1, splits=[split])
        ev = Evaluation([split])

        for agent_type in ['Stop', 'Shortest', 'Random']:
            outfile = '%s%s_%s_agent.json' % (train.RESULT_DIR, split,
                                              agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score_file(outfile)
            print('\n%s' % agent_type)
            pp.pprint(score_summary)
Example no. 6
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok) if len(train_splits) > 0 else None
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    agent = make_follower(args, vocab)
    agent.env = train_env

    return train_env, test_envs, agent
Example no. 7
import argparse

import numpy as np

import utils
from utils import read_vocab, Tokenizer, timeSince, try_cuda, vocab_pad_idx
from env import R2RBatch, ImageFeatures
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB
from model import CompatLanEncoderLSTM, CompatVisEncoderLSTM, dotSimilarity, EncoderLSTM, SpeakerEncoderLSTM
from compatModel import compatModel

# Register the image-feature arguments on the parser, then parse only the known ones.
parser = argparse.ArgumentParser()
ImageFeatures.add_args(parser)
args, _ = parser.parse_known_args()

image_features_list = ImageFeatures.from_args(args)
angle_inc = np.pi / 6.
def build_viewpoint_loc_embedding(viewIndex):
    """
    Position embedding:
    heading 64D + elevation 64D
    1) heading: [sin(heading) for _ in range(1, 33)] +
                [cos(heading) for _ in range(1, 33)]
    2) elevation: [sin(elevation) for _ in range(1, 33)] +
                  [cos(elevation) for _ in range(1, 33)]
    """
    embedding = np.zeros((36, 128), np.float32)
    for absViewIndex in range(36):
        relViewIndex = (absViewIndex - viewIndex) % 12 + (absViewIndex // 12) * 12
        rel_heading = (relViewIndex % 12) * angle_inc
        rel_elevation = (relViewIndex // 12 - 1) * angle_inc
        # Fill the four 32-dim blocks described in the docstring: sin/cos of the
        # relative heading, then sin/cos of the relative elevation.
        embedding[absViewIndex, 0:32] = np.sin(rel_heading)
        embedding[absViewIndex, 32:64] = np.cos(rel_heading)
        embedding[absViewIndex, 64:96] = np.sin(rel_elevation)
        embedding[absViewIndex, 96:] = np.cos(rel_elevation)
    return embedding
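Codebases built around this helper typically precompute the 36 per-view location embeddings once at module load and index into the list during rollout. A minimal sketch of that pattern, assuming the name _static_loc_embeddings (it does not appear in the excerpt above):

# Hypothetical precompute step: one (36, 128) embedding per possible agent view index.
_static_loc_embeddings = [build_viewpoint_loc_embedding(viewIndex)
                          for viewIndex in range(36)]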
Example no. 8
def make_env_and_models(args,
                        train_vocab_path,
                        train_splits,
                        test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=train_splits,
                         tokenizer=tok)

    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE

    # =============================================================================
    #     visEncoder = try_cuda(CompatVisEncoderLSTM(
    #         action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #         bidirectional=bidirectional))
    # =============================================================================
    visEncoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    # =============================================================================
    #     lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #         len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #         dropout_ratio, bidirectional=True, glove=glove))
    # =============================================================================
    lanEncoder = try_cuda(
        EncoderLSTM(len(vocab),
                    word_embedding_size,
                    enc_hidden_size,
                    vocab_pad_idx,
                    dropout_ratio,
                    bidirectional=False,
                    glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    #visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    #lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=[split],
                         tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }

    #test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[3000:4000]
    return train_env, test_envs, visEncoder, lanEncoder, dotSim