def main(args):
    """Train an image-captioning model on the Flickr data and demo decoding.

    The function (1) builds a ResNet18 + LSTM captioning network and fits it,
    saving the result to ``img2text.bin``; (2) builds a CNN-only model to
    extract 512-dim image features for one batch; (3) builds a non-recurrent
    decoder, copies the trained parameters into it and runs it word by word
    for a single sample, feeding the LSTM states back in at every step.

    Args:
        args: namespace with the attributes read below: ``small`` (use a
            reduced dataset / fewer epochs), ``batch_size``, ``gpu``
            (select the GPU computing service) and ``mem`` (memory mode
            passed to the computing service).
    """

    def compute_service():
        # A fresh computing-service object is created for every build call,
        # exactly as when the expression is written inline.
        if args.gpu:
            return eddl.CS_GPU(mem=args.mem)
        return eddl.CS_CPU(mem=args.mem)

    eddl.download_flickr()

    epochs = 2 if args.small else 50

    olength = 20
    outvs = 2000
    embdim = 32

    # True: remove last layers and set new top = flatten
    # new input_size: [3, 256, 256] (from [224, 224, 3])
    net = eddl.download_resnet18(True, [3, 256, 256])
    lreshape = eddl.getLayer(net, "top")

    # create a new model from input output
    image_in = eddl.getLayer(net, "input")

    # Decoder
    ldecin = eddl.Input([outvs])
    ldec = eddl.ReduceArgMax(ldecin, [0])
    ldec = eddl.RandomUniform(
        eddl.Embedding(ldec, outvs, 1, embdim, True), -0.05, 0.05
    )
    ldec = eddl.Concat([ldec, lreshape])
    layer = eddl.LSTM(ldec, 512, True)
    out = eddl.Softmax(eddl.Dense(layer, outvs))
    eddl.setDecoder(ldecin)
    net = eddl.Model([image_in], [out])

    # Build model
    eddl.build(
        net,
        eddl.adam(0.01),
        ["softmax_cross_entropy"],
        ["accuracy"],
        compute_service(),
    )
    eddl.summary(net)

    # Load dataset
    x_train = Tensor.load("flickr_trX.bin", "bin")
    y_train = Tensor.load("flickr_trY.bin", "bin")
    if args.small:
        x_train = x_train.select([f"0:{2 * args.batch_size}", ":", ":", ":"])
        y_train = y_train.select([f"0:{2 * args.batch_size}", ":"])
    # Move the channel axis next to the batch axis to match the
    # [3, 256, 256] network input.
    xtrain = Tensor.permute(x_train, [0, 3, 1, 2])
    y_train = Tensor.onehot(y_train, outvs)
    # batch x timesteps x input_dim
    y_train.reshape_([y_train.shape[0], olength, outvs])

    eddl.fit(net, [xtrain], [y_train], args.batch_size, epochs)
    eddl.save(net, "img2text.bin", "bin")

    print("\n === INFERENCE ===\n")

    # Get all the reshapes of the images. Only use the CNN
    timage = Tensor([x_train.shape[0], 512])  # images reshape
    cnn = eddl.Model([image_in], [lreshape])
    eddl.build(
        cnn,
        eddl.adam(0.001),  # not relevant
        ["mse"],  # not relevant
        ["mse"],  # not relevant
        compute_service(),
    )
    eddl.summary(cnn)

    # forward images
    xbatch = Tensor([args.batch_size, 3, 256, 256])
    # Draw the batch from the permuted tensor: xbatch is channel-first and
    # the network was fitted on xtrain, so the raw x_train layout would feed
    # the CNN pixels in the wrong order.
    eddl.next_batch([xtrain], [xbatch])
    eddl.forward(cnn, [xbatch])
    ybatch = eddl.getOutput(lreshape)
    # NOTE(review): only a single batch is forwarded, so only the first
    # batch_size rows of timage are written here; the sample index used
    # below must fall inside that range to read meaningful features.
    first_batch = f"0:{args.batch_size}"
    timage.set_select([first_batch, ":"], ybatch)

    # Create Decoder non recurrent for n-best
    ldecin = eddl.Input([outvs])
    image = eddl.Input([512])
    lstate = eddl.States([2, 512])
    ldec = eddl.ReduceArgMax(ldecin, [0])
    ldec = eddl.RandomUniform(
        eddl.Embedding(ldec, outvs, 1, embdim), -0.05, 0.05
    )
    ldec = eddl.Concat([ldec, image])
    lstm = eddl.LSTM([ldec, lstate], 512, True)
    lstm.isrecurrent = False  # Important
    out = eddl.Softmax(eddl.Dense(lstm, outvs))
    decoder = eddl.Model([ldecin, image, lstate], [out])
    eddl.build(
        decoder,
        eddl.adam(0.001),  # not relevant
        ["softmax_cross_entropy"],  # not relevant
        ["accuracy"],  # not relevant
        compute_service(),
    )
    eddl.summary(decoder)

    # Copy params from trained net
    for trained_name, decoder_name in (
        ("LSTM1", "LSTM2"),
        ("dense1", "dense2"),
        ("embedding1", "embedding2"),
    ):
        eddl.copyParam(
            eddl.getLayer(net, trained_name),
            eddl.getLayer(decoder, decoder_name),
        )

    # N-best for sample s
    s = 1 if args.small else 100  # sample 100
    # three input tensors with batch_size = 1 (one sentence)
    treshape = timage.select([str(s), ":"])
    treshape.reshape_([1, 512])  # batch = 1
    text = y_train.select([str(s), ":", ":"])  # 1 x olength x outvs
    # The state tensor must outlive a single step: it starts at zero and is
    # refreshed with the LSTM states after every forward pass so the next
    # word is decoded from the previous step's state.
    state = Tensor.zeros([1, 2, 512])  # batch = 1
    for j in range(olength):
        print(f"Word: {j}")
        if j == 0:
            # No previous word for the first step.
            word = Tensor.zeros([1, outvs])
        else:
            # Feed the reference word of the previous timestep.
            word = text.select(["0", str(j - 1), ":"])
            word.reshape_([1, outvs])  # batch = 1
        eddl.forward(decoder, [word, treshape, state])
        # outword = eddl.getOutput(out)
        for i, lstm_state in enumerate(eddl.getStates(lstm)):
            lstm_state.reshape_([1, 1, 512])
            state.set_select([":", str(i), ":"], lstm_state)
    print("All done")
def main(args):
    """Train an LSTM encoder-decoder on the EuTrans translation data.

    Downloads the dataset, builds an embedding + LSTM encoder whose states
    are passed to an embedding + LSTM decoder, loads and reshapes the train
    and test tensors, and fits the network one epoch at a time.

    Args:
        args: namespace with the attributes read below: ``small`` (fewer
            epochs and a truncated dataset), ``batch_size``, ``gpu``
            (select the GPU computing service) and ``mem`` (memory mode
            passed to the computing service).
    """
    eddl.download_eutrans()

    n_epochs = 1 if args.small else 5

    src_len = 30
    tgt_len = 30
    src_vocab = 687
    tgt_vocab = 514
    emb_dim = 64

    # Encoder: one word per timestep -> embedding -> LSTM
    in_ = eddl.Input([1])
    src_emb = eddl.Embedding(in_, src_vocab, 1, emb_dim, True)
    src_emb = eddl.RandomUniform(src_emb, -0.05, 0.05)
    encoder = eddl.LSTM(src_emb, 128, True)
    enc_states = eddl.GetStates(encoder)

    # Decoder: takes the encoder states alongside its own embedded input
    dec_in = eddl.Input([tgt_vocab])
    dec = eddl.ReduceArgMax(dec_in, [0])
    dec = eddl.Embedding(dec, tgt_vocab, 1, emb_dim)
    dec = eddl.RandomUniform(dec, -0.05, 0.05)
    dec = eddl.LSTM([dec, enc_states], 128)
    out = eddl.Softmax(eddl.Dense(dec, tgt_vocab))
    eddl.setDecoder(dec_in)
    net = eddl.Model([in_], [out])

    # Build model
    optimizer = eddl.adam(0.01)
    if args.gpu:
        device = eddl.CS_GPU(mem=args.mem)
    else:
        device = eddl.CS_CPU(mem=args.mem)
    eddl.build(
        net, optimizer, ["softmax_cross_entropy"], ["accuracy"], device
    )
    eddl.summary(net)

    def load_pair(x_file, y_file):
        # Load one split, one-hot encode the targets and reshape both
        # tensors in place to batch x timesteps x feature_dim.
        xs = Tensor.load(x_file)
        ys = Tensor.onehot(Tensor.load(y_file), tgt_vocab)
        xs.reshape_([xs.shape[0], src_len, 1])
        ys.reshape_([ys.shape[0], tgt_len, tgt_vocab])
        return xs, ys

    # Load dataset
    x_train, y_train = load_pair("eutrans_trX.bin", "eutrans_trY.bin")
    x_test, y_test = load_pair("eutrans_tsX.bin", "eutrans_tsY.bin")

    if args.small:
        # Keep only the first three batches of every split.
        head = [f":{3 * args.batch_size}", ":", ":"]
        x_train = x_train.select(head)
        y_train = y_train.select(head)
        x_test = x_test.select(head)
        y_test = y_test.select(head)

    # Train model
    ybatch = Tensor([args.batch_size, tgt_len, tgt_vocab])
    eddl.next_batch([y_train], [ybatch])
    for _ in range(n_epochs):
        eddl.fit(net, [x_train], [y_train], args.batch_size, 1)

    print("All done")