Example #1
0
def main():
    """Load a trained joint speech/text model and print beam-search
    transcriptions for the first Flickr8k training batch.

    For each utterance in the batch, prints the reference character-level
    transcription followed by the K-best hypotheses from ``transcribe``
    with their cumulative scores.

    Requires CUDA (tensors are moved with ``.cuda()``) and the pickled
    model checkpoint under ``experiments/``.
    """
    import vg.flickr8k_provider as dp_f
    import vg.simple_data as sd
    batch_size = 16
    prov_flickr = dp_f.getDataProvider('flickr8k', root='.', audio_kind='mfcc')
    data_flickr = sd.SimpleData(prov_flickr,
                                tokenize=sd.characters,
                                min_df=1,
                                scale=False,
                                batch_size=batch_size,
                                shuffle=False)
    net = torch.load(
        "experiments/s2-t.-s2i2-s2t.-t2s.-s2d0-d1-embed-128-joint-e/model.23.pkl"
    )
    #net = torch.load("experiments/s2-t1-s2i2-s2t0-t2s0-s2d0-d1-joint-f/model.19.pkl")
    # Compact the RNN weight storage after deserialization; avoids the
    # cuDNN "weights are not contiguous" warning on the first forward pass.
    net.SpeechTranscriber.TextDecoder.Decoder.RNN.flatten_parameters()
    net.SpeechTranscriber.SpeechEncoderBottom.RNN.flatten_parameters()
    #batches = data_flickr.iter_valid_batches()
    batches = data_flickr.iter_train_batches()
    first = next(batches)
    texts = list(data_flickr.mapper.inverse_transform(first['input']))
    args = net.SpeechTranscriber.args(first)
    # NOTE(review): torch.autograd.Variable is a no-op wrapper since
    # torch 0.4; kept as-is because the checkpoint predates that API.
    args = [torch.autograd.Variable(torch.from_numpy(x)).cuda() for x in args]
    audio, target_t, target_prev_t = args
    # Fix: iterate over batch_size instead of the duplicated magic
    # number 16, so the loop stays consistent if batch_size changes.
    for j in range(batch_size):
        print(''.join(texts[j]))
        for seq in transcribe(net, audio[j:j + 1], K=5, maxlen=25):

            vals, ids = zip(*seq)

            chars = list(data_flickr.mapper.inverse_transform([ids]))[0]
            text = ''.join(
                ['_' if char == '<BEG>' else char for char in chars])
            print("{:.2f} {}".format(sum(vals), text))
        print()
Example #2
0
                    help='Test mode',
                    dest='testmode',
                    action='store_true',
                    default=False)
# Parse the command-line options built up above this chunk.
args = parser.parse_args()

# Setup test mode
if args.testmode:
    # Shrink the run to a quick smoke test: one epoch over a small
    # slice of the data.
    epochs = 1
    limit = 100
    # NOTE(review): `epochs` and `limit` presumably have defaults
    # assigned earlier in the file; otherwise the non-test path would
    # raise NameError at the SimpleData call below — confirm.

# Flickr8k MFCC provider, tokenized at character level; shuffled
# batches, optionally truncated by `limit` / `limit_val`.
prov_flickr = dp_f.getDataProvider('flickr8k', root='..', audio_kind='mfcc')
data_flickr = sd.SimpleData(prov_flickr,
                            tokenize=sd.characters,
                            min_df=1,
                            scale=False,
                            batch_size=batch_size,
                            shuffle=True,
                            limit=limit,
                            limit_val=limit)


def get_audio(sent):
    """Return the 'audio' field of a sentence record.

    Passed as the `tokenize` callable to the ASR scorer below, so that
    scoring operates on the audio features of each sentence.
    """
    audio_features = sent['audio']
    return audio_features


scorer = vg.scorer.ScorerASR(
    prov_flickr,
    dict(split='val',
         tokenize=get_audio,
         batch_size=batch_size,
         limit=limit,
Example #3
0
# Fix both Python's and numpy's RNG seeds for reproducibility.
seed = 101
random.seed(seed)
numpy.random.seed(seed)

import vg.simple_data as sd
import vg.flickr8k_provider as dp_f
import vg.libri_provider as dp_l
import vg.defn.three_way2 as D
import vg.scorer

# Training hyperparameters.
batch_size = 16
epochs=25

# MFCC data providers for Flickr8k and the 'libri' corpus (presumably
# LibriSpeech — confirm); both tokenized at character level with
# shuffled batches.
prov_flickr = dp_f.getDataProvider('flickr8k', root='../..', audio_kind='mfcc')
prov_libri = dp_l.getDataProvider('libri', root='../..', audio_kind='mfcc')
data_flickr = sd.SimpleData(prov_flickr, tokenize=sd.characters, min_df=1, scale=False,
                            batch_size=batch_size, shuffle=True)
data_libri  = sd.SimpleData(prov_libri, tokenize=sd.characters, min_df=1, scale=False,
                            batch_size=batch_size, shuffle=True)
model_config = dict(TextImage=dict(ImageEncoder=dict(size=1024, size_target=4096),
                                   lr=0.0002,
                                   margin_size=0.2,
                                   max_norm=2.0, 
                                   TextEncoderTop=dict(size=1024, size_feature=1024, depth=1, size_attn=128)),
                    SpeechImage=dict(ImageEncoder=dict(size=1024, size_target=4096),
                                     lr=0.0002,
                                     margin_size=0.2,
                                     max_norm=2.0, 
                                     SpeechEncoderTop=dict(size=1024, size_input=1024, depth=1, size_attn=128)),
                    SpeechText=dict(TextEncoderTop=dict(size_feature=1024,
                                                        size=1024,
                                                        depth=1,
Example #4
0
# Seed Python's and numpy's RNGs for reproducibility
# (`seed` is assigned earlier in the file, outside this chunk).
random.seed(seed)
numpy.random.seed(seed)

import vg.simple_data as sd
import vg.experiment as E
import vg.data_provider as dp
import vg.defn.audiovis_rhn as D
# Run configuration: Flickr8k with a human-derived MFCC variant.
dataset = 'flickr8k'
batch_size = 32
epochs = 15
prov = dp.getDataProvider(dataset,
                          root='../..',
                          audio_kind='human.max1K.accel3.ord.mfcc')
# No `tokenize=` here (unlike the character-level setups elsewhere in
# this repo), and a higher min_df of 10 — word-level by default, presumably.
data = sd.SimpleData(prov,
                     min_df=10,
                     scale=False,
                     batch_size=batch_size,
                     shuffle=True)
model_config = dict(size=1024,
                    depth=4,
                    recur_depth=2,
                    max_norm=2.0,
                    residual=True,
                    drop_i=0.25,
                    drop_s=0.1,
                    lr=0.0002,
                    size_vocab=37,
                    size_target=4096,
                    filter_length=6,
                    filter_size=64,
                    stride=2,
Example #5
0
# Seed numpy's RNG for reproducibility
# (`seed` is assigned earlier in the file, outside this chunk).
numpy.random.seed(seed)

import vg.simple_data as sd
import vg.places_provider as dp

import vg.defn.three_way_stack as D
import vg.scorer

# Training hyperparameters.
batch_size = 16
epochs = 25

# Places corpus with MFCC audio features, tokenized at character level,
# shuffled batches.
prov_places = dp.getDataProvider('places', root='../..', audio_kind='mfcc')

data_places = sd.SimpleData(prov_places,
                            tokenize=sd.characters,
                            min_df=1,
                            scale=False,
                            batch_size=batch_size,
                            shuffle=True)

model_config = dict(
    SpeechImage=dict(ImageEncoder=dict(size=1024, size_target=4096),
                     lr=0.0002,
                     margin_size=0.2,
                     max_norm=2.0,
                     SpeechEncoderTop=dict(size=1024,
                                           size_input=1024,
                                           depth=2,
                                           size_attn=128)),
    SpeechEncoderBottom=dict(size=1024,
                             depth=2,
                             size_vocab=13,
Example #6
0
# Fix both Python's and numpy's RNG seeds for reproducibility.
seed = 1235
random.seed(seed)
numpy.random.seed(seed)

import vg.simple_data as sd
import vg.experiment as E
import vg.vendrov_provider as dp
import vg.defn.segmatch as D
# Run configuration: COCO via the Vendrov provider, MFCC audio.
dataset = 'coco'
batch_size = 32
epochs = 15
prov = dp.getDataProvider(dataset, root='../..', audio_kind='mfcc')
# No `tokenize=` (word-level by default, presumably); `erasure=15` and
# `limit=5000` are segmatch-specific knobs — semantics defined in
# SimpleData, not visible here.
data = sd.SimpleData(prov,
                     min_df=10,
                     scale=False,
                     batch_size=batch_size,
                     shuffle=True,
                     erasure=15,
                     limit=5000)
print("Loaded data")
model_config = dict(size=512,
                    depth=5,
                    max_norm=2.0,
                    residual=True,
                    lr=0.0002,
                    size_vocab=13,
                    size_target=512,
                    filter_length=6,
                    filter_size=64,
                    stride=3,
                    contrastive=True,