Example 1
def train(label_set='full', pool_mode='sum', layer_sizes=[512, 256],
          activation='prelu', drop_unk=False, word_vecs=None,
          return_net=False, cv=10, val_split=0.00, label_unk=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)

    embedding_weights = np.zeros((vocab_size + 1, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
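
The helpers load_bin_vec and add_unknown_words are used throughout these examples but never defined. A minimal sketch of add_unknown_words consistent with how it is called above, modeled on the word2vec utilities in Kim's CNN_sentence code (the [-0.25, 0.25] range is that code's convention, an assumption here):

import numpy as np

def add_unknown_words(word_vecs, word2idx, k=300):
    # give every vocabulary word missing from the pretrained vectors
    # a small random vector, so the embedding matrix has no gaps
    for word in word2idx:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
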
Example 2
def build_model(maxlen,
                vocab_size,
                word2idx,
                embeddings_path,
                nb_labels,
                sent_model='lstm',
                ffweights=None):
    # load word embeddings
    word_vectors = load_bin_vec(embeddings_path, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size + 1, 300))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    prem = Input(shape=(maxlen, ), dtype='int32')
    hyp = Input(shape=(maxlen, ), dtype='int32')

    if sent_model == 'lstm':
        emb = Embedding(input_dim=vocab_size + 1,
                        output_dim=300,
                        input_length=maxlen,
                        weights=[embedding_weights])
        prem_emb = emb(prem)
        hyp_emb = emb(hyp)

        encoder = LSTM(100, activation='relu', return_sequences=False)
        prem_enc = encoder(prem_emb)
        hyp_enc = encoder(hyp_emb)

    else:
        # shared feed-forward sentence encoder: embed, pool over time,
        # project. It is wrapped in its own Model so the pretrained
        # encoder weights (ffweights) can be loaded into it; the pooling
        # layer and the 100-unit width are placeholder choices.
        sentence_input = Input(shape=(maxlen, ), dtype='int32')
        x = Embedding(input_dim=vocab_size + 1,
                      output_dim=300,
                      input_length=maxlen,
                      weights=[embedding_weights])(sentence_input)
        x = GlobalAveragePooling1D()(x)
        x = Dense(100, activation='relu')(x)
        ff = Model(input=sentence_input, output=x)

        if ffweights:
            ff.load_weights(ffweights)

        prem_enc = ff(prem)
        hyp_enc = ff(hyp)

    merged = merge([prem_enc, hyp_enc], mode='concat', concat_axis=-1)
    dropout = Dropout(0.5)(merged)

    fc1 = Dense(200, activation='relu')(dropout)
    fc2 = Dense(200, activation='relu')(fc1)
    fc3 = Dense(200, activation='relu')(fc2)
    probas = Dense(nb_labels, activation='softmax')(fc3)

    model = Model(input=[prem, hyp], output=probas)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
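
A hypothetical call showing how build_model is meant to be driven; the vectors path, the word2idx mapping, and the integer-coded premise/hypothesis arrays X_prem and X_hyp (each of shape (n_samples, maxlen)) are assumptions, and the calls use the Keras 1 API to match the code above:

model = build_model(maxlen=20,
                    vocab_size=len(word2idx),
                    word2idx=word2idx,
                    embeddings_path='GoogleNews-vectors-negative300.bin',
                    nb_labels=3,
                    sent_model='lstm')
model.fit([X_prem, X_hyp], y_onehot, batch_size=64, nb_epoch=10)
loss, acc = model.evaluate([X_prem, X_hyp], y_onehot)
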
Example 3
def train(label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False, layer_sizes=[512, 256],
          pool_mode='sum'):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    # pretrained_embeddings, emb_dim, and the other hyperparameters used
    # below are presumably module-level settings
    if pretrained_embeddings:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)  # sklearn < 0.18 API
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        nn = None
        nn = EnsembleNN(vocab_size, nb_labels, emb_dim, maxlen,
                        embedding_weights, filter_hs, nb_filters,
                        dropout_p, trainable_embeddings, pretrained_embeddings,
                        layer_sizes, pool_mode)
        if i == 0:
            print_summary(nn.model.layers)
        acc = train_and_test_model(nn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch,
                                   lr, beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = nn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 4
def setup_generation(word_vecs, model_weights=None, scene_model=False):
    with open('../../models/l_enc.pkl', 'rb') as p:
        l_enc = pickle.load(p)
    with open('../../models/word2idx.json') as j:
        word2idx = json.load(j)
    vocab_size = len(word2idx) + 1
    max_caption_len = 40
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, 300))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
    model = build_model(vocab_size, max_caption_len, '../../vgg16_weights.h5',
                        embedding_weights, scene_model=scene_model)
    if model_weights:
        model.load_weights(model_weights)
    return model, word2idx, l_enc
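
A hypothetical greedy decoding loop built on what setup_generation returns; the model's input list (a preprocessed image batch plus the padded partial caption) and the '<s>'/'</s>' boundary tokens are assumptions not shown in these examples:

from keras.preprocessing.sequence import pad_sequences

model, word2idx, l_enc = setup_generation('GoogleNews-vectors-negative300.bin')
idx2word = dict((i, w) for w, i in word2idx.items())
max_caption_len = 40  # mirrors setup_generation

caption = [word2idx['<s>']]
for _ in range(max_caption_len):
    padded = pad_sequences([caption], maxlen=max_caption_len, padding='post')
    probas = model.predict([image, padded])  # image: preprocessed VGG16 input
    next_idx = probas[0].argmax()
    caption.append(next_idx)
    if idx2word.get(next_idx) == '</s>':
        break
print(' '.join(idx2word[i] for i in caption[1:]))
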
Example 5
def setup_generation(word_vecs, model_weights=None, scene_model=False):
    with open('../../models/l_enc.pkl', 'rb') as p:
        l_enc = pickle.load(p)
    with open('../../models/word2idx.json') as j:
        word2idx = json.load(j)
    vocab_size = len(word2idx) + 1
    max_caption_len = 40
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, 300))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
    model = build_model(vocab_size,
                        max_caption_len,
                        '../../vgg16_weights.h5',
                        embedding_weights,
                        scene_model=scene_model)
    if model_weights:
        model.load_weights(model_weights)
    return model, word2idx, l_enc
Example 6
            next_words.append(indices[j + 1])
    partial_captions = [pc[:40] for pc in partial_captions]  # cap at max length
    partial_captions = pad_sequences(partial_captions, padding='post')

    print "pc shape", partial_captions.shape
    print "imfiles shape", len(imfiles)
    print "scenes shape", len(scenes)
    print "next_words shape", len(next_words)

    vocab_size = len(word2idx) + 1
    max_caption_len = 40

    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    model = build_model(vocab_size,
                        max_caption_len,
                        vggweights_path,
                        embedding_weights,
                        scene_model=scene_model)
    if modelweights_path:
        model.load_weights(modelweights_path)

    for m in model.layers[0].layers:
        print_summary(m.layers)
    print_summary(model.layers)
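
Example 6 opens partway through turning caption index sequences into (partial caption, next word) training pairs. A sketch of that pairing step, with captions_as_indices (a list of word-index lists) as an assumed name:

partial_captions, next_words = [], []
for indices in captions_as_indices:
    # each prefix of a caption predicts the word that follows it
    for j in range(len(indices) - 1):
        partial_captions.append(indices[:j + 1])
        next_words.append(indices[j + 1])
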
Example 7
def train(model_type='parallel',
          label_set='full',
          drop_unk=False,
          word_vecs=None,
          setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        return {
            'X': X,
            'y': y,
            'word2idx': word2idx,
            'l_enc': l_enc,
            'y_binary': y_binary,
            'labels': labels,
            'nb_labels': nb_labels,
            'maxlen': maxlen,
            'emb_dim': emb_dim,
            'vocab_size': vocab_size,
            'embedding_weights': embedding_weights,
            'cnn': cnn
        }

    params = [('filter_hs', filter_hs), ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size), ('nb_epoch', nb_epoch), ('lr', lr),
              ('beta_1', beta_1), ('beta_2', beta_2), ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)

        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr, beta_1, beta_2,
                                   epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 8
    Xdf['sentence'] = Xdf['sentence'].apply(lambda x: ' '.join(x.split()[:-1]))
    ydf['sentence'] = ydf['sentence'].apply(lambda x: ' '.join(x.split()[1:]))
    X_sents, _, _, _ = load_dataset(Xdf, pad=True, word2idx=word2idx)
    y_sents, _, _, _ = load_dataset(ydf, pad=True, word2idx=word2idx)
    scene_labels = Xdf.label

    vocab_size = len(word2idx) + 1
    max_caption_len = X_sents.shape[1]

    ims_train, ims_test = ic[:4000], ic[4000:]
    X_sents_train, y_sents_train = X_sents[:20000], y_sents[:20000]
    X_sents_test, y_sents_test = X_sents[20000:], y_sents[20000:]


    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    model = load_vgg_weights(build_model(vocab_size,
                                         max_caption_len,
                                         emb_dim,
                                         embedding_weights),
                             weights_path)
    print "VGG 16 weights loaded."

    print_summary(model.layers)

    samples_per_epoch = X_sents_train.shape[0]
    checkpoint = ModelCheckpoint('../../weights.{epoch:02d}.h5')
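
A hypothetical continuation using the two values Example 8 just set up; train_generator and the epoch count are placeholders, and the call follows the Keras 1 fit_generator signature (samples_per_epoch, nb_epoch):

model.fit_generator(train_generator,
                    samples_per_epoch=samples_per_epoch,
                    nb_epoch=50,
                    callbacks=[checkpoint])
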
Example 9
def train(model_type='parallel', label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    if pretrained_embeddings:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        return {'X': X,
                'y': y,
                'word2idx': word2idx,
                'l_enc': l_enc,
                'y_binary': y_binary,
                'labels': labels,
                'nb_labels': nb_labels,
                'maxlen': maxlen,
                'emb_dim': emb_dim,
                'vocab_size': vocab_size,
                'embedding_weights': embedding_weights,
                'cnn': cnn}

    params = [('filter_hs', filter_hs), ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size), ('nb_epoch', nb_epoch),
              ('lr', lr), ('beta_1', beta_1), ('beta_2', beta_2),
              ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)

        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch,
                                   lr, beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)