def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
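# A minimal usage sketch, not the repo's exact driver: it assumes an `args`
# namespace produced by an argparse parser with ImageFeatures.add_args
# registered (as done elsewhere in this repo), and TRAIN_VOCAB as the training
# vocabulary path; the split names and instruction limit are illustrative.
def example_speaker_setup(args):
    train_env, test_envs, encoder, decoder = make_env_and_models(
        args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'],
        test_instruction_limit=1)
    # test_envs maps each split to an (environment, SpeakerEvaluation) pair.
    return train_env, test_envs, encoder, decoder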
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    if args.job is None:  # create vocab only during training (job is None)
        vocab = build_vocab(train_splits)
        write_vocab(vocab, TRAIN_VOCAB)

    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = (R2RBatch(image_features_list, batch_size=args.batch_size,
                          splits=train_splits, tokenizer=tok)
                 if len(train_splits) > 0 else None)
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=[split], tokenizer=tok),
                Evaluation(split, args.instrType))
        for split in test_splits
    }

    agent = make_follower(args, vocab)
    agent.env = train_env

    if args.useObjLabelOrVis in ['label', 'both']:
        if train_env is not None:
            agent.pointer.wtoi = train_env.wtoi
        else:
            agent.pointer.wtoi = test_envs[test_splits[0]][0].wtoi

    return train_env, test_envs, agent
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
def make_more_train_env(args, train_vocab_path, train_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok)
    return train_env
def eval_simple_agents(args):
    ''' Run simple baselines on each split. '''
    img_features = ImageFeatures.from_args(args)
    for split in ['train', 'val_seen', 'val_unseen', 'test']:
        env = R2RBatch(img_features, batch_size=1, splits=[split])
        ev = Evaluation([split])

        for agent_type in ['Stop', 'Shortest', 'Random']:
            outfile = '%s%s_%s_agent.json' % (
                train.RESULT_DIR, split, agent_type.lower())
            agent = BaseAgent.get_agent(agent_type)(env, outfile)
            agent.test()
            agent.write_results()
            score_summary, _ = ev.score_file(outfile)
            print('\n%s' % agent_type)
            pp.pprint(score_summary)
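# Illustrative driver, not the repo's CLI: it builds an argparse namespace the
# same way the other entry points in this repo do (argparse plus
# ImageFeatures.add_args) and then scores the simple baselines.
def _run_simple_baselines():
    parser = argparse.ArgumentParser()
    ImageFeatures.add_args(parser)
    args, _ = parser.parse_known_args()
    eval_simple_agents(args)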
def make_env_and_models(args, train_vocab_path, train_splits, test_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = (R2RBatch(image_features_list, batch_size=args.batch_size,
                          splits=train_splits, tokenizer=tok)
                 if len(train_splits) > 0 else None)
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    agent = make_follower(args, vocab)
    agent.env = train_env

    return train_env, test_envs, agent
import argparse

import numpy as np

import utils
from utils import read_vocab, Tokenizer, timeSince, try_cuda, vocab_pad_idx
from env import R2RBatch, ImageFeatures
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB
from model import (CompatLanEncoderLSTM, CompatVisEncoderLSTM, dotSimilarity,
                   EncoderLSTM, SpeakerEncoderLSTM)
from compatModel import compatModel

parser = argparse.ArgumentParser()
ImageFeatures.add_args(parser)
args, _ = parser.parse_known_args()
image_features_list = ImageFeatures.from_args(args)

angle_inc = np.pi / 6.


def build_viewpoint_loc_embedding(viewIndex):
    """
    Position embedding: heading 64D + elevation 64D
    1) heading: [sin(heading) for _ in range(1, 33)] +
                [cos(heading) for _ in range(1, 33)]
    2) elevation: [sin(elevation) for _ in range(1, 33)] +
                  [cos(elevation) for _ in range(1, 33)]
    """
    embedding = np.zeros((36, 128), np.float32)
    for absViewIndex in range(36):
        relViewIndex = (absViewIndex - viewIndex) % 12 + (absViewIndex // 12) * 12
        rel_heading = (relViewIndex % 12) * angle_inc
        rel_elevation = (relViewIndex // 12 - 1) * angle_inc
        # Fill the four 32-dim blocks described in the docstring.
        embedding[absViewIndex, 0:32] = np.sin(rel_heading)
        embedding[absViewIndex, 32:64] = np.cos(rel_heading)
        embedding[absViewIndex, 64:96] = np.sin(rel_elevation)
        embedding[absViewIndex, 96:] = np.cos(rel_elevation)
    return embedding
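# Usage sketch: the embedding depends only on the agent's current viewIndex, so
# it can be precomputed once for all 36 view indices and looked up at run time
# (the cache name below is illustrative, not required by the repo).
_static_loc_embeddings = [
    build_viewpoint_loc_embedding(viewIndex) for viewIndex in range(36)]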
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)
    # Extend the training data with hard negatives and shuffle.
    train_env.data.extend(hardNeg_train)
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE

    # visEncoder = try_cuda(CompatVisEncoderLSTM(
    #     action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #     bidirectional=bidirectional))
    visEncoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    # lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #     len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #     dropout_ratio, bidirectional=True, glove=glove))
    lanEncoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=False, glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    # visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    # lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }

    # test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[3000:4000]

    return train_env, test_envs, visEncoder, lanEncoder, dotSim
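# Usage sketch, assuming the module-level hardNeg_* lists referenced above are
# already loaded and that 'val_unseen' is among the test splits (the function
# indexes it directly); split names follow the R2R convention used elsewhere.
def example_compat_setup(args):
    # Returns (train_env, test_envs, visEncoder, lanEncoder, dotSim).
    return make_env_and_models(
        args, TRAIN_VOCAB, ['train'], ['val_seen', 'val_unseen'])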