def train(label_set='full', pool_mode='sum', layer_sizes=[512, 256],
          activation='prelu', drop_unk=False, word_vecs=None,
          return_net=False, cv=10, val_split=0.00, label_unk=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size + 1, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
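# The embedding-matrix construction above (index 0 reserved for Keras's
# zero-padding mask, one row per vocabulary word) is repeated throughout
# this module. A minimal, self-contained sketch of the pattern with a toy
# vocabulary and random vectors standing in for the word2vec lookups
# (illustrative values only, not data from this repo):
import numpy as np

emb_dim = 300
word2idx = {'a': 1, 'dog': 2, 'runs': 3}            # index 0 is never assigned
word_vectors = dict((w, np.random.uniform(-0.25, 0.25, emb_dim))
                    for w in word2idx)

vocab_size = len(word2idx) + 1                      # +1 for the 0 mask index
embedding_weights = np.zeros((vocab_size, emb_dim))
for word, index in word2idx.items():
    embedding_weights[index, :] = word_vectors[word]

# Row 0 stays all-zero, so padded positions contribute nothing.
assert not embedding_weights[0].any()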
def main(rnn_layer='lstm', word_vecs=None):
    print "Loading data...",
    df = sentences_df(SENTENCES_CSV)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print X.shape
    y_binary = to_categorical(y)
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    print "Data loaded."
    labels = np.unique(y)
    n_labels = labels.shape[0]
    max_len = X.shape[1]
    vocab_dim = 300
    n_vocab = len(word2idx) + 1  # 0 masking
    embedding_weights = np.zeros((n_vocab + 1, vocab_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        model = create_model(n_vocab, n_labels, vocab_dim, embedding_weights,
                             rnn_layer=rnn_layer)
        if i == 0:
            print_summary(model.layers)
        _, score = train_and_test_model(model, X[train], y_binary[train],
                                        X[test], y_binary[test])
        cv_scores.append(score)
        train_time = time.time() - start_time
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i + 1, train_time, score, len(test))
    print "avg cv acc: %.4f" % np.mean(cv_scores)
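# The cross-validation loops in these scripts use the pre-0.18 scikit-learn
# StratifiedKFold interface: the labels go to the constructor and the object
# is iterated directly, yielding integer index arrays. A self-contained
# sketch of that pattern with toy data (illustrative only; the import path
# assumes an old scikit-learn release):
import numpy as np
from sklearn.cross_validation import StratifiedKFold

y = np.repeat(np.arange(4), 25)                   # 100 samples, 4 classes
X = np.random.randint(0, 1000, size=(100, 30))    # padded index sequences

skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
for i, (train, test) in enumerate(skf):
    # train/test are index arrays, so X[train], y[test] work as in the
    # functions above
    print i, len(train), len(test)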
def build_model(maxlen, vocab_size, word2idx, embeddings_path, nb_labels,
                sent_model='lstm', ffweights=None):
    # load word embeddings
    word_vectors = load_bin_vec(embeddings_path, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size + 1, 300))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    prem = Input(shape=(maxlen,), dtype='int32')
    hyp = Input(shape=(maxlen,), dtype='int32')

    if sent_model == 'lstm':
        # shared embedding and shared LSTM encoder for premise and hypothesis
        emb = Embedding(input_dim=vocab_size + 1, output_dim=300,
                        input_length=maxlen, weights=[embedding_weights])
        prem_emb = emb(prem)
        hyp_emb = emb(hyp)
        encoder = LSTM(100, activation='relu', return_sequences=False)
        prem_enc = encoder(prem_emb)
        hyp_enc = encoder(hyp_emb)
    else:
        # Feed-forward sentence encoder: `ff` is assumed to be a shared
        # feed-forward encoder defined elsewhere in this module; if
        # `ffweights` is given, its pre-trained weights are loaded before
        # encoding the two sentences.
        emb = Embedding(input_dim=vocab_size + 1, output_dim=300,
                        input_length=maxlen, weights=[embedding_weights])
        prem_emb = emb(prem)
        hyp_emb = emb(hyp)
        if ffweights:
            ff.load_weights(ffweights)
        prem_enc = ff(prem_emb)
        hyp_enc = ff(hyp_emb)

    # concatenate the two sentence encodings and classify
    merged = merge([prem_enc, hyp_enc], mode='concat', concat_axis=-1)
    dropout = Dropout(0.5)(merged)
    fc1 = Dense(200, activation='relu')(dropout)
    fc2 = Dense(200, activation='relu')(fc1)
    fc3 = Dense(200, activation='relu')(fc2)
    probas = Dense(nb_labels, activation='softmax')(fc3)

    model = Model(input=[prem, hyp], output=probas)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
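# A hypothetical call site for build_model. The maxlen, label count,
# embeddings path, and training arrays below are assumptions for
# illustration, not values from this repository; fit() uses the Keras 1.x
# signature (nb_epoch) that the rest of this code targets.
model = build_model(maxlen=30,
                    vocab_size=len(word2idx),
                    word2idx=word2idx,
                    embeddings_path='GoogleNews-vectors-negative300.bin',
                    nb_labels=3,
                    sent_model='lstm')
model.fit([X_prem, X_hyp], y_binary,
          batch_size=64, nb_epoch=10, validation_split=0.1)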
def train(label_set='full', drop_unk=False, word_vecs=None, setup_only=False,
          layer_sizes=[512, 256], pool_mode='sum'):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        nn = None
        nn = EnsembleNN(vocab_size, nb_labels, emb_dim, maxlen,
                        embedding_weights, filter_hs, nb_filters, dropout_p,
                        trainable_embeddings, pretrained_embeddings,
                        layer_sizes, pool_mode)
        if i == 0:
            print_summary(nn.model.layers)
        acc = train_and_test_model(nn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr,
                                   beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = nn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i + 1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
def setup_generation(word_vecs, model_weights=None, scene_model=False):
    with open('../../models/l_enc.pkl') as p:
        l_enc = pickle.load(p)
    with open('../../models/word2idx.json') as j:
        word2idx = json.load(j)
    vocab_size = len(word2idx) + 1
    max_caption_len = 40
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, 300))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
    model = build_model(vocab_size, max_caption_len, '../../vgg16_weights.h5',
                        embedding_weights, scene_model=scene_model)
    if model_weights:
        model.load_weights(model_weights)
    return model, word2idx, l_enc
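# A hypothetical way to load a trained generator for inference with
# setup_generation. Both filenames below are assumptions, not files shipped
# with this repo.
model, word2idx, l_enc = setup_generation(
    word_vecs='GoogleNews-vectors-negative300.bin',
    model_weights='caption_model_weights.h5',
    scene_model=True)
idx2word = dict((i, w) for w, i in word2idx.items())  # decode predicted indices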
        # (inside the loop over caption positions, elided above)
        partial_captions.append(indices[:j + 1])
        next_words.append(indices[j + 1])

    # truncate partial captions to the maximum caption length, then pad
    partial_captions = [pc if len(pc) <= 40 else pc[:40]
                        for pc in partial_captions]
    partial_captions = pad_sequences(partial_captions, padding='post')
    print "pc shape", partial_captions.shape
    print "imfiles shape", len(imfiles)
    print "scenes shape", len(scenes)
    print "next_words shape", len(next_words)

    vocab_size = len(word2idx) + 1
    max_caption_len = 40
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    model = build_model(vocab_size, max_caption_len, vggweights_path,
                        embedding_weights, scene_model=scene_model)
    if modelweights_path:
        model.load_weights(modelweights_path)
    for m in model.layers[0].layers:
        print_summary(m.layers)
def train(model_type='parallel', label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        return {'X': X,
                'y': y,
                'word2idx': word2idx,
                'l_enc': l_enc,
                'y_binary': y_binary,
                'labels': labels,
                'nb_labels': nb_labels,
                'maxlen': maxlen,
                'emb_dim': emb_dim,
                'vocab_size': vocab_size,
                'embedding_weights': embedding_weights,
                'cnn': cnn}

    params = [('filter_hs', filter_hs),
              ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size),
              ('nb_epoch', nb_epoch),
              ('lr', lr),
              ('beta_1', beta_1),
              ('beta_2', beta_2),
              ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)
        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr,
                                   beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i + 1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
    ydf = df.copy()
    Xdf['sentence'] = Xdf['sentence'].apply(lambda x: ' '.join(x.split()[:-1]))
    ydf['sentence'] = ydf['sentence'].apply(lambda x: ' '.join(x.split()[1:]))
    X_sents, _, _, _ = load_dataset(Xdf, pad=True, word2idx=word2idx)
    y_sents, _, _, _ = load_dataset(ydf, pad=True, word2idx=word2idx)
    scene_labels = Xdf.label

    vocab_size = len(word2idx) + 1
    max_caption_len = X_sents.shape[1]

    ims_train, ims_test = ic[:4000], ic[4000:]
    X_sents_train, y_sents_train = X_sents[:20000], y_sents[:20000]
    X_sents_test, y_sents_test = X_sents[20000:], y_sents[20000:]

    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    model = load_vgg_weights(build_model(vocab_size, max_caption_len, emb_dim,
                                         embedding_weights), weights_path)
    print "VGG 16 weights loaded."
    print_summary(model.layers)
    samples_per_epoch = X_sents_train.shape[0]