def main():
    """Transcribe a handful of Flickr8k utterances with a trained speech model.

    Loads a pickled three-way model, pulls the first training batch from the
    Flickr8k character-level data, and for each utterance in the batch prints
    the reference text followed by the K-best beam-search transcriptions with
    their cumulative scores.

    Side effects: reads the checkpoint from disk, moves tensors to the GPU,
    and writes results to stdout.  Relies on module-level `torch` and
    `transcribe` being in scope.
    """
    import vg.flickr8k_provider as dp_f
    import vg.simple_data as sd
    batch_size = 16
    prov_flickr = dp_f.getDataProvider('flickr8k', root='.', audio_kind='mfcc')
    # Character-level tokenization, unshuffled so the "first" batch is stable.
    data_flickr = sd.SimpleData(prov_flickr, tokenize=sd.characters, min_df=1,
                                scale=False, batch_size=batch_size,
                                shuffle=False)
    net = torch.load(
        "experiments/s2-t.-s2i2-s2t.-t2s.-s2d0-d1-embed-128-joint-e/model.23.pkl"
    )
    #net = torch.load("experiments/s2-t1-s2i2-s2t0-t2s0-s2d0-d1-joint-f/model.19.pkl")
    # Compact RNN weights after unpickling so cuDNN does not warn/re-copy.
    net.SpeechTranscriber.TextDecoder.Decoder.RNN.flatten_parameters()
    net.SpeechTranscriber.SpeechEncoderBottom.RNN.flatten_parameters()
    #batches = data_flickr.iter_valid_batches()
    batches = data_flickr.iter_train_batches()
    first = next(batches)
    # Reference transcriptions (lists of characters) for the batch.
    texts = list(data_flickr.mapper.inverse_transform(first['input']))
    args = net.SpeechTranscriber.args(first)
    # NOTE(review): torch.autograd.Variable is deprecated in modern PyTorch;
    # kept as-is since the pickled model presumably targets the old API.
    args = [torch.autograd.Variable(torch.from_numpy(x)).cuda() for x in args]
    audio, target_t, target_prev_t = args
    # Was hard-coded `range(16)`, silently duplicating batch_size; use the
    # constant so the loop cannot run past the batch if batch_size changes.
    for j in range(batch_size):
        print(''.join(texts[j]))
        for seq in transcribe(net, audio[j:j + 1], K=5, maxlen=25):
            vals, ids = zip(*seq)
            chars = list(data_flickr.mapper.inverse_transform([ids]))[0]
            # Render begin-of-sequence markers as '_' for readability.
            text = ''.join(
                ['_' if char == '<BEG>' else char for char in chars])
            print("{:.2f} {}".format(sum(vals), text))
        print()
help='Test mode', dest='testmode', action='store_true', default=False) args = parser.parse_args() # Setup test mode if args.testmode: epochs = 1 limit = 100 prov_flickr = dp_f.getDataProvider('flickr8k', root='..', audio_kind='mfcc') data_flickr = sd.SimpleData(prov_flickr, tokenize=sd.characters, min_df=1, scale=False, batch_size=batch_size, shuffle=True, limit=limit, limit_val=limit) def get_audio(sent): return sent['audio'] scorer = vg.scorer.ScorerASR( prov_flickr, dict(split='val', tokenize=get_audio, batch_size=batch_size, limit=limit,
seed = 101 random.seed(seed) numpy.random.seed(seed) import vg.simple_data as sd import vg.flickr8k_provider as dp_f import vg.libri_provider as dp_l import vg.defn.three_way2 as D import vg.scorer batch_size = 16 epochs=25 prov_flickr = dp_f.getDataProvider('flickr8k', root='../..', audio_kind='mfcc') prov_libri = dp_l.getDataProvider('libri', root='../..', audio_kind='mfcc') data_flickr = sd.SimpleData(prov_flickr, tokenize=sd.characters, min_df=1, scale=False, batch_size=batch_size, shuffle=True) data_libri = sd.SimpleData(prov_libri, tokenize=sd.characters, min_df=1, scale=False, batch_size=batch_size, shuffle=True) model_config = dict(TextImage=dict(ImageEncoder=dict(size=1024, size_target=4096), lr=0.0002, margin_size=0.2, max_norm=2.0, TextEncoderTop=dict(size=1024, size_feature=1024, depth=1, size_attn=128)), SpeechImage=dict(ImageEncoder=dict(size=1024, size_target=4096), lr=0.0002, margin_size=0.2, max_norm=2.0, SpeechEncoderTop=dict(size=1024, size_input=1024, depth=1, size_attn=128)), SpeechText=dict(TextEncoderTop=dict(size_feature=1024, size=1024, depth=1,
random.seed(seed) numpy.random.seed(seed) import vg.simple_data as sd import vg.experiment as E import vg.data_provider as dp import vg.defn.audiovis_rhn as D dataset = 'flickr8k' batch_size = 32 epochs = 15 prov = dp.getDataProvider(dataset, root='../..', audio_kind='human.max1K.accel3.ord.mfcc') data = sd.SimpleData(prov, min_df=10, scale=False, batch_size=batch_size, shuffle=True) model_config = dict(size=1024, depth=4, recur_depth=2, max_norm=2.0, residual=True, drop_i=0.25, drop_s=0.1, lr=0.0002, size_vocab=37, size_target=4096, filter_length=6, filter_size=64, stride=2,
numpy.random.seed(seed) import vg.simple_data as sd import vg.places_provider as dp import vg.defn.three_way_stack as D import vg.scorer batch_size = 16 epochs = 25 prov_places = dp.getDataProvider('places', root='../..', audio_kind='mfcc') data_places = sd.SimpleData(prov_places, tokenize=sd.characters, min_df=1, scale=False, batch_size=batch_size, shuffle=True) model_config = dict( SpeechImage=dict(ImageEncoder=dict(size=1024, size_target=4096), lr=0.0002, margin_size=0.2, max_norm=2.0, SpeechEncoderTop=dict(size=1024, size_input=1024, depth=2, size_attn=128)), SpeechEncoderBottom=dict(size=1024, depth=2, size_vocab=13,
seed = 1235 random.seed(seed) numpy.random.seed(seed) import vg.simple_data as sd import vg.experiment as E import vg.vendrov_provider as dp import vg.defn.segmatch as D dataset = 'coco' batch_size = 32 epochs = 15 prov = dp.getDataProvider(dataset, root='../..', audio_kind='mfcc') data = sd.SimpleData(prov, min_df=10, scale=False, batch_size=batch_size, shuffle=True, erasure=15, limit=5000) print("Loaded data") model_config = dict(size=512, depth=5, max_norm=2.0, residual=True, lr=0.0002, size_vocab=13, size_target=512, filter_length=6, filter_size=64, stride=3, contrastive=True,