Example 1
def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log',
                                alpha=0.001,
                                l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        y_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        classes=np.unique(y))
                print "Epoch: %d/%d Train acc: %.4f" \
                    % (epoch+1, n_epoch, clf.score(X_train, y_train))
            fold_score = clf.score(X_test, y_test)
            print "Fold acc: %.4f" % fold_score
            scores.append(fold_score)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
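This loop uses the pre-0.18 scikit-learn API (StratifiedKFold built from y and n_folds and iterated directly) together with the Python 2 print statement. Below is a minimal sketch of the same out-of-core fold loop against the current scikit-learn API, assuming the same X and y arrays from load_dataset; it is a port for reference, not code from the original repository, and note that loss='log' has since been renamed to 'log_loss'.

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold

def cv_partial_fit(X, y, batch_size=128, n_epoch=3):
    # StratifiedKFold now takes n_splits and yields index pairs from split(X, y).
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    scores = []
    for train, test in skf.split(X, y):
        clf = SGDClassifier(loss='log_loss', alpha=0.001, l1_ratio=0, random_state=0)
        X_train, y_train = X[train], y[train]
        n_batches = X_train.shape[0] // batch_size
        for epoch in range(n_epoch):
            for b in range(n_batches):
                sl = slice(b * batch_size, (b + 1) * batch_size)
                clf.partial_fit(X_train[sl], y_train[sl], classes=np.unique(y))
            print("Epoch: %d/%d Train acc: %.4f"
                  % (epoch + 1, n_epoch, clf.score(X_train, y_train)))
        scores.append(clf.score(X[test], y[test]))
    return np.mean(scores)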
Example 2
def train(label_set='full', pool_mode='sum', layer_sizes=[512, 256],
          activation='prelu', drop_unk=False, word_vecs=None,
          return_net=False, cv=10, val_split=0.00, label_unk=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)

    embedding_weights = np.zeros((vocab_size+1, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index,:] = word_vectors[word]
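The snippet is cut off before the model itself is built, and emb_dim is a module-level constant that is not shown (the next example uses 300-dimensional vectors, matching GoogleNews word2vec). Below is a minimal sketch of how such a weight matrix is typically handed to a Keras Embedding layer with index 0 reserved for padding/masking; the emb_dim value and the missing model code are assumptions, not the repository's code.

from keras.layers import Embedding

emb_dim = 300  # assumed; not defined in the snippet above
embedding = Embedding(input_dim=embedding_weights.shape[0],
                      output_dim=emb_dim,
                      weights=[embedding_weights],
                      input_length=maxlen,
                      mask_zero=True)  # index 0 is the padding value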
Example 3
def main(rnn_layer='lstm', word_vecs=None):
    print "Loading data...",
    df = sentences_df(SENTENCES_CSV)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print X.shape
    y_binary = to_categorical(y)
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    print "Data loaded."

    labels = np.unique(y)
    n_labels = labels.shape[0]
    max_len = X.shape[1]
    vocab_dim = 300
    n_vocab = len(word2idx) + 1 # 0 masking
    embedding_weights = np.zeros((n_vocab+1, vocab_dim))
    for word, index in word2idx.items():
        embedding_weights[index,:] = word_vectors[word]

    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        model = create_model(n_vocab, n_labels, vocab_dim,
                             embedding_weights, rnn_layer=rnn_layer)
        if i == 0:
            print_summary(model.layers)
        _, score = train_and_test_model(model, X[train], y_binary[train],
                                     X[test], y_binary[test])
        cv_scores.append(score)
        train_time = time.time() - start_time
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, score, len(test))
    print "avg cv acc: %.4f" % np.mean(cv_scores)
Example 4
def train(label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False, layer_sizes=[512,256],
          pool_mode='sum'):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size+1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index,:] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        nn = None
        nn = EnsembleNN(vocab_size, nb_labels, emb_dim, maxlen,
                        embedding_weights, filter_hs, nb_filters,
                        dropout_p, trainable_embeddings, pretrained_embeddings,
                        layer_sizes, pool_mode)
        if i == 0:
            print_summary(nn.model.layers)
        acc = train_and_test_model(nn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch,
                                   lr, beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = nn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 5
def train_and_test_nb(ngram_order=1):
    print "ngram order:", ngram_order
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        df = sentences_df(SENTENCES_CSV, labels=label_set)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        clf = BernoulliNB()
        scores = cross_val_score(clf, X, y, cv=10)
        print_label_frequencies(df)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
Example 6
import os
import re
import sys

from keras.preprocessing.sequence import pad_sequences
from keras.utils.layer_utils import print_summary

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from preprocessing.data_utils import (
    load_bin_vec,
    sentences_df,
    add_unknown_words,
    load_dataset,
)

word_vecs = sys.argv[1]
captions_file = sys.argv[2]

df = sentences_df(labels='full', drop_unk=True)
X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)

elc_sents = set(list(df.sentence))
flickr_sents = []
with open('../../results_20130124.token') as f:
    for line in f:
        line = line.strip().lower().split('\t')[-1]
        if line not in elc_sents:
            flickr_sents.append(line)

idx = len(word2idx) + 1
for sent in flickr_sents:
    for word in re.split("-| ", sent):
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
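The window ends before the extended vocabulary is used. Since pad_sequences is imported above, the Flickr sentences are presumably encoded with the extended word2idx and padded like X; a minimal sketch under that assumption (not the original file):

# Encode each Flickr sentence with the extended word2idx and pad to the
# caption length used for X (assumes the same "-| " tokenization as above).
flickr_seqs = [[word2idx[w] for w in re.split("-| ", sent) if w]
               for sent in flickr_sents]
flickr_X = pad_sequences(flickr_seqs, maxlen=X.shape[1])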
Example 7
    scene_model = args.scenes

    df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)
    imlookup = {}
    for imfile in set(df.img_file):
        impath = os.path.join(img_directory, imfile)
        im = scipy.misc.imread(impath)
        im = preprocess_im(im)
        imlookup[imfile] = im

    # X sentences should have the start token but not the end token,
    # since they'll be the input to the RNN.
    # Y sentences should have the end token, but not the start token,
    # since they'll be the predictions given by the RNN.
    print "Loading captions..."
    X, scene_labels, word2idx, l_enc = load_dataset(df, pad=True)
    with open('../../models/word2idx.json') as j:
        word2idx = json.load(j)
    with open('../../models/l_enc.pkl') as p:
        l_enc = pickle.load(p)

    # partial captions are word 0..n
    # next word is word n+1
    # create samples for n=1...N-1
    partial_captions = []
    next_words = []
    imfiles = []
    scenes = []
    for i, x_i in enumerate(X):
        indices = x_i[np.nonzero(x_i)[0]]
        for j in range(len(indices) - 1):
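            # Editor's sketch of the truncated loop body (an assumption, not the
            # original code): words 0..j form the partial caption, word j+1 is
            # the target, and the image file and scene label are recorded
            # alongside them, filling the lists initialised above.
            partial_captions.append(indices[:j + 1])
            next_words.append(indices[j + 1])
            imfiles.append(df.img_file.iloc[i])
            scenes.append(scene_labels[i])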
Example 8
def train(model_type='parallel',
          label_set='full',
          drop_unk=False,
          word_vecs=None,
          setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        return {
            'X': X,
            'y': y,
            'word2idx': word2idx,
            'l_enc': l_enc,
            'y_binary': y_binary,
            'labels': labels,
            'nb_labels': nb_labels,
            'maxlen': maxlen,
            'emb_dim': emb_dim,
            'vocab_size': vocab_size,
            'embedding_weights': embedding_weights,
            'cnn': cnn
        }

    params = [('filter_hs', filter_hs), ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size), ('nb_epoch', nb_epoch), ('lr', lr),
              ('beta_1', beta_1), ('beta_2', beta_2), ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)

        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr, beta_1, beta_2,
                                   epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 9
    print "Data loaded."

    params = [('batch_size',batch_size), ('nb_epoch',nb_epoch),
              ('lr',lr), ('beta_1',beta_1), ('beta_2',beta_2),
              ('epsilon',epsilon)]
    for (name, value) in params:
        print name + ':', value

    if label_unk:
        unk_df = pd.read_csv(SENTENCES_CSV)
        unk_df = unk_df[(unk_df.q3 == 'other_unclear') | (unk_df.q4 == 'other_unclear')]
        df2 = create_unk_labeled_instances(unk_df)
        df3 = sentences_df(label_unk=df2)
        X_unk, y_unk, _, _ = load_dataset(df3, pad=True)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y_orig, test_size=0.2, random_state=0)

        print "Training and testing without labeled unknown on %i samples" % len(X_train)
        nn1 = FeedforwardNN(vocab_size,
                            nb_labels,
                            emb_dim,
                            maxlen,
                            layer_sizes,
                            activation,
                            embedding_weights,
                            pool_mode=pool_mode)

        _, acc = train_and_test_model(nn1, X_train, y_train, X_test, y_test,
Example 10
    df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)

    # sort df by ImageCollection ordering
    img_order = [f.split('/')[-1] for f in ic.files]
    df['img_file_ord'] = pd.Categorical(
        df['img_file'],
        categories=img_order,
        ordered=True)
    df = df.sort_values(by='img_file_ord')

    # X sentences should have the start token but not the end token,
    # since they'll be the input to the RNN.
    # Y sentences should have the end token, but not the start token,
    # since they'll be the predictions given by the RNN.
    print "Loading captions..."
    _, _, word2idx, _ = load_dataset(df, pad=True, truncate=True)
    Xdf = df.copy()
    ydf = df.copy()
    Xdf['sentence'] = Xdf['sentence'].apply(lambda x: ' '.join(x.split()[:-1]))
    ydf['sentence'] = ydf['sentence'].apply(lambda x: ' '.join(x.split()[1:]))
    X_sents, _, _, _ = load_dataset(Xdf, pad=True, word2idx=word2idx)
    y_sents, _, _, _ = load_dataset(ydf, pad=True, word2idx=word2idx)
    scene_labels = Xdf.label

    vocab_size = len(word2idx) + 1
    max_caption_len = X_sents.shape[1]

    ims_train, ims_test = ic[:4000], ic[4000:]
    X_sents_train, y_sents_train = X_sents[:20000], y_sents[:20000]
    X_sents_test, y_sents_test = X_sents[20000:], y_sents[20000:]
Example 11
import sys

import numpy as np

from preprocessing.data_utils import (
    load_bin_vec,
    sentences_df,
    add_unknown_words,
    load_dataset,
)
from paths import GENERATED_TEXT

def log(content, outfile):
    """Write to stdout and outfile. outfile is open file"""
    outfile.write(content)
    sys.stdout.write(content)

# load sentences
df = sentences_df(labels='function')
labels = np.unique(df.label)
text = '\n'.join(s for s in df.sentence.values).split()

sents, _, word2idx, l_enc = load_dataset(df, pad=True)
idx2word = {i: w for w,i in word2idx.items()}
maxlen = sents.shape[1]
vocab = word2idx.keys()

# cut X into sequences of 10 words
# given 9 words, predict 10th
seqlen = 9
step = 2
sentences = []
next_words = []
mask = np.nonzero(sents)
text = sents[mask].astype(np.int32)
for i in range(0, len(text) - seqlen, step):
    sentences.append(text[i: i + seqlen])
    next_words.append(text[i + seqlen])
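The window ends with the lists of 9-word contexts and their next words. A small sketch of how they would typically be packed into arrays for a next-word prediction model, with integer inputs for an embedding layer and one-hot targets over the vocabulary (an assumption, not the original file):

X_seq = np.asarray(sentences, dtype=np.int32)   # shape (n_samples, seqlen)
y_next = np.zeros((len(next_words), len(word2idx) + 1), dtype=np.bool_)
for i, w in enumerate(next_words):
    y_next[i, w] = True                         # one-hot next-word targets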