def get_nets():
    """Return the labelled networks used for comparison.

    Returns:
        A list of ``(label, net)`` pairs, in this order:

        * ``'m6_init'`` -- a ``D.Net`` newly constructed from the config
          below and moved to the GPU with ``.cuda()``; no checkpoint is
          loaded into it in this function.
        * ``'m1'`` -- the result of ``load_best_run`` for the
          ``s2-t.-s2i2-s2t.-t2s.-t2i.`` run directory (``cond=''``).
        * ``'m6'`` -- the result of ``load_best_run`` for the
          ``s2-t1-s2i2-s2t0-t2s0-t2i1`` run directory (``cond='joint'``).
    """
    import vg.defn.three_way2 as D

    # Each run is loaded exactly once. net_mt has to exist before the config
    # is built, because the text vocabulary size is read from it below.
    # NOTE(review): assumes load_best_run is a pure loader (same arguments ->
    # equivalent network, no required side effects) -- confirm.
    net_base = load_best_run(
        '{}/s2-t.-s2i2-s2t.-t2s.-t2i.'.format(PREFIX), cond='')
    net_mt = load_best_run(
        '{}/s2-t1-s2i2-s2t0-t2s0-t2i1'.format(PREFIX), cond='joint')

    config = dict(
        TextImage=dict(
            ImageEncoder=dict(size=1024, size_target=4096),
            lr=0.0002,
            margin_size=0.2,
            max_norm=2.0,
            TextEncoderTop=dict(
                size=1024, size_feature=1024, depth=1, size_attn=128),
        ),
        SpeechImage=dict(
            ImageEncoder=dict(size=1024, size_target=4096),
            lr=0.0002,
            margin_size=0.2,
            max_norm=2.0,
            SpeechEncoderTop=dict(
                size=1024, size_input=1024, depth=2, size_attn=128),
        ),
        SpeechText=dict(
            TextEncoderTop=dict(
                size_feature=1024, size=1024, depth=0, size_attn=128),
            SpeechEncoderTop=dict(
                size=1024, size_input=1024, depth=0, size_attn=128),
            lr=0.0002,
            margin_size=0.2,
            max_norm=2.0,
        ),
        SpeechEncoderBottom=dict(
            size=1024, depth=2, size_vocab=13, filter_length=6,
            filter_size=64, stride=2),
        TextEncoderBottom=dict(
            # Reuse the feature size of the loaded multi-task network so the
            # new network's text encoder is built with the same value.
            size_feature=net_mt.TextEncoderBottom.size_feature,
            size_embed=128,
            size=1024,
            depth=1,
        ),
    )

    net_init = D.Net(config).cuda()
    return [('m6_init', net_init), ('m1', net_base), ('m6', net_mt)]
SpeechEncoderBottom=dict(size=1024, depth=2, size_vocab=13, filter_length=6, filter_size=64, stride=2), TextEncoderBottom=dict(size_feature=data_flickr.mapper.size(), size_embed=128, size=1024, depth=1) ) def audio(sent): return sent['audio'] net = D.Net(model_config) net.batcher = None net.mapper = None scorer = vg.scorer.Scorer(prov_flickr, dict(split='val', tokenize=audio, batch_size=batch_size )) run_config = dict(epochs=epochs, validate_period=400, tasks=[ ('SpeechText', net.SpeechText), ('SpeechImage', net.SpeechImage), ('TextImage', net.TextImage)],