Example 1
def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log',
                                alpha=0.001,
                                l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        y_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        classes=np.unique(y))
                print "Epoch: %d/%d Train acc: %.4f" \
                    % (epoch+1, n_epoch, clf.score(X_train, y_train))
            fold_score = clf.score(X_test, y_test)
            print "Fold acc: %.4f" % fold_score
            scores.append(fold_score)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
Example 2
def train(label_set='full', pool_mode='sum', layer_sizes=[512, 256],
          activation='prelu', drop_unk=False, word_vecs=None,
          return_net=False, cv=10, val_split=0.00, label_unk=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)

    embedding_weights = np.zeros((vocab_size+1, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index,:] = word_vectors[word]
Example 3
def main(rnn_layer='lstm', word_vecs=None):
    print "Loading data...",
    df = sentences_df(SENTENCES_CSV)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print X.shape
    y_binary = to_categorical(y)
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    print "Data loaded."

    labels = np.unique(y)
    n_labels = labels.shape[0]
    max_len = X.shape[1]
    vocab_dim = 300
    n_vocab = len(word2idx) + 1 # 0 masking
    embedding_weights = np.zeros((n_vocab+1, vocab_dim))
    for word, index in word2idx.items():
        embedding_weights[index,:] = word_vectors[word]

    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        model = create_model(n_vocab, n_labels, vocab_dim,
                             embedding_weights, rnn_layer=rnn_layer)
        if i == 0:
            print_summary(model.layers)
        _, score = train_and_test_model(model, X[train], y_binary[train],
                                     X[test], y_binary[test])
        cv_scores.append(score)
        train_time = time.time() - start_time
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, score, len(test))
    print "avg cv acc: %.4f" % np.mean(cv_scores)
Example 4
def main():
    label_set = 'function'
    df = sentences_df(labels=label_set)
    labels = np.unique(df.label.values)

    out = open(GENERATED_TEXT, 'a')

    for label in labels:
        label_df = df[df.label == label]
        sents = label_df['sentence'].values
        text = '\n'.join(sent for sent in sents)
        print 'corpus length:', len(text)

        chars = sorted(list(set(text)))
        print 'total chars:', len(chars)
        char_indices = dict((c, i) for i, c in enumerate(chars))
        indices_char = dict((i, c) for i, c in enumerate(chars))

        # cut text into sequences of maxlen chars
        maxlen = 60
        step = 3
        sentences = []
        next_chars = []
        for i in range(0, len(text) - maxlen, step):
            sentences.append(text[i:i + maxlen])
            next_chars.append(text[i + maxlen])
        print 'nb sequences:', len(sentences)

        print 'Vectorization...'
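        # one-hot encode: X[i, t, c] = 1 if character c occurs at position t of
        # sequence i; y[i, c] = 1 for the character that follows sequence i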
        X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
        y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1

        print 'Build model...'
        model = Sequential()
        model.add(
            GRU(512,
                return_sequences=True,
                input_shape=(maxlen, len(chars)),
                activation='relu'))
        model.add(Dropout(0.5))
        #model.add(GRU(512, return_sequences=True, activation='relu'))
        #model.add(Dropout(0.5))
        model.add(GRU(512, return_sequences=False, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(len(chars), activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam')
        print_summary(model.layers)

        log('label: %s\n' % label, out)
        train_and_generate(600, X, y, model, text, maxlen, chars, indices_char,
                           char_indices, out)

        model.save_weights('%s.h5' % label)

    out.close()
Example 5
def train(label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False, layer_sizes=[512,256],
          pool_mode='sum'):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size+1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index,:] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        nn = None
        nn = EnsembleNN(vocab_size, nb_labels, emb_dim, maxlen,
                        embedding_weights, filter_hs, nb_filters,
                        dropout_p, trainable_embeddings, pretrained_embeddings,
                        layer_sizes, pool_mode)
        if i == 0:
            print_summary(nn.model.layers)
        acc = train_and_test_model(nn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch,
                                   lr, beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = nn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 6
def main():
    label_set = 'function'
    df = sentences_df(labels=label_set)
    labels = np.unique(df.label.values)

    out = open(GENERATED_TEXT, 'a')

    for label in labels:
        label_df = df[df.label == label]
        sents = label_df['sentence'].values
        text = '\n'.join(sent for sent in sents)
        print 'corpus length:', len(text)

        chars = sorted(list(set(text)))
        print 'total chars:', len(chars)
        char_indices = dict((c,i) for i,c in enumerate(chars))
        indices_char = dict((i,c) for i,c in enumerate(chars))

        # cut text into sequences of maxlen chars
        maxlen = 60
        step = 3
        sentences = []
        next_chars = []
        for i in range(0, len(text) - maxlen, step):
            sentences.append(text[i: i + maxlen])
            next_chars.append(text[i + maxlen])
        print 'nb sequences:', len(sentences)

        print 'Vectorization...'
        X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
        y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1

        print 'Build model...'
        model = Sequential()
        model.add(GRU(512, return_sequences=True,
                      input_shape=(maxlen, len(chars)), activation='relu'))
        model.add(Dropout(0.5))
        #model.add(GRU(512, return_sequences=True, activation='relu'))
        #model.add(Dropout(0.5))
        model.add(GRU(512, return_sequences=False, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(len(chars), activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam')
        print_summary(model.layers)

        log('label: %s\n' % label, out)
        train_and_generate(600, X, y, model, text, maxlen, chars,
                           indices_char, char_indices, out)

        model.save_weights('%s.h5' % label)

    out.close()
Example 7
def train_and_test_nb(ngram_order=1):
    print "ngram order:", ngram_order
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        df = sentences_df(SENTENCES_CSV, labels=label_set)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        clf = BernoulliNB()
        scores = cross_val_score(clf, X, y, cv=10)
        print_label_frequencies(df)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
Example 8
def train_and_test_nb(ngram_order=1):
    print "ngram order:", ngram_order
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        df = sentences_df(SENTENCES_CSV, labels=label_set)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape, )
        print "y shape: %s" % (y.shape, )
        clf = BernoulliNB()
        scores = cross_val_score(clf, X, y, cv=10)
        print_label_frequencies(df)
        print '%s label mean cv accuracy: %.4f\n' % (label_set,
                                                     np.mean(scores))
Example 9
def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape, )
        print "y shape: %s" % (y.shape, )
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log',
                                alpha=0.001,
                                l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx *
                                batch_size:(minibatch_idx + 1) * batch_size],
                        y_train[minibatch_idx *
                                batch_size:(minibatch_idx + 1) * batch_size],
                        classes=np.unique(y))
                print "Epoch: %d/%d Train acc: %.4f" \
                    % (epoch+1, n_epoch, clf.score(X_train, y_train))
            fold_score = clf.score(X_test, y_test)
            print "Fold acc: %.4f" % fold_score
            scores.append(fold_score)
        print '%s label mean cv accuracy: %.4f\n' % (label_set,
                                                     np.mean(scores))
Example 10
def main(rnn_layer='lstm', word_vecs=None):
    print "Loading data...",
    df = sentences_df(SENTENCES_CSV)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print X.shape
    y_binary = to_categorical(y)
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    print "Data loaded."

    labels = np.unique(y)
    n_labels = labels.shape[0]
    max_len = X.shape[1]
    vocab_dim = 300
    n_vocab = len(word2idx) + 1  # 0 masking
    embedding_weights = np.zeros((n_vocab + 1, vocab_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]

    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        model = create_model(n_vocab,
                             n_labels,
                             vocab_dim,
                             embedding_weights,
                             rnn_layer=rnn_layer)
        if i == 0:
            print_summary(model.layers)
        _, score = train_and_test_model(model, X[train], y_binary[train],
                                        X[test], y_binary[test])
        cv_scores.append(score)
        train_time = time.time() - start_time
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, score, len(test))
    print "avg cv acc: %.4f" % np.mean(cv_scores)
Example 11
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from preprocessing.data_utils import (
    sentences_df,
    load_dataset,
    load_bin_vec,
    add_unknown_words
)
from paths import GENERATED_TEXT

def log(content, outfile):
    """Write to stdout and outfile. outfile is open file"""
    outfile.write(content)
    sys.stdout.write(content)

# load sentences
df = sentences_df(labels='function')
labels = np.unique(df.label)
text = '\n'.join(s for s in df.sentence.values).split()

sents, _, word2idx, l_enc = load_dataset(df, pad=True)
idx2word = {i: w for w,i in word2idx.items()}
maxlen = sents.shape[1]
vocab = word2idx.keys()

# cut X into sequences of 10 words
# given 9 words, predict 10th
seqlen = 9
step = 2
sentences = []
next_words = []
mask = np.nonzero(sents)
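# The snippet is cut off here; a minimal hypothetical sketch of the missing
# sequence-cutting loop, mirroring the character-level version in Examples 4
# and 6 (sentences/next_words come from above, the rest is an assumption):
for sent in sents:
    words = [idx2word[i] for i in sent if i != 0]  # drop the 0-padding
    for i in range(0, len(words) - seqlen, step):
        sentences.append(words[i: i + seqlen])
        next_words.append(words[i + seqlen])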
Example 12
    parser.add_argument('--dim', type=int)
    parser.add_argument('--vggweights', type=str)
    parser.add_argument('--modelweights', type=str, required=False)
    parser.add_argument('--wordvecs', type=str)
    parser.add_argument('--images', type=str)
    parser.add_argument('--scenes', action='store_true')
    args = parser.parse_args()

    vggweights_path = args.vggweights
    modelweights_path = args.modelweights
    word_vecs = args.wordvecs
    img_directory = args.images
    emb_dim = args.dim
    scene_model = args.scenes

    df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)
    imlookup = {}
    for imfile in set(df.img_file):
        impath = os.path.join(img_directory, imfile)
        im = scipy.misc.imread(impath)
        im = preprocess_im(im)
        imlookup[imfile] = im

    # X sentences should have the start token but not the end token,
    # since they'll be the input to the RNN.
    # Y sentences should have the end token, but not the start token,
    # since they'll be the predictions given by the RNN.
    print "Loading captions..."
    X, scene_labels, word2idx, l_enc = load_dataset(df, pad=True)
    with open('../../models/word2idx.json') as j:
        word2idx = json.load(j)
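    # Hypothetical illustration of the start/end-token convention described in
    # the comment above (assumes <s>/</s> special tokens in the padded sequences):
    #   caption:    <s> a man rides a horse </s>
    #   X (input):  [<s>, a, man, rides, a, horse]   -- start token, no end token
    #   Y (target): [a, man, rides, a, horse, </s>]  -- X shifted left by one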
Example 13
def train(model_type='parallel',
          label_set='full',
          drop_unk=False,
          word_vecs=None,
          setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size + 1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        return {
            'X': X,
            'y': y,
            'word2idx': word2idx,
            'l_enc': l_enc,
            'y_binary': y_binary,
            'labels': labels,
            'nb_labels': nb_labels,
            'maxlen': maxlen,
            'emb_dim': emb_dim,
            'vocab_size': vocab_size,
            'embedding_weights': embedding_weights,
            'cnn': cnn
        }

    params = [('filter_hs', filter_hs), ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size), ('nb_epoch', nb_epoch), ('lr', lr),
              ('beta_1', beta_1), ('beta_2', beta_2), ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)

        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr, beta_1, beta_2,
                                   epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 14
    assert np.all(embedding_weights[randidx] ==
                  word_vectors[idx2word[randidx]])
    print "Data loaded."

    params = [('batch_size',batch_size), ('nb_epoch',nb_epoch),
              ('lr',lr), ('beta_1',beta_1), ('beta_2',beta_2),
              ('epsilon',epsilon)]
    for (name, value) in params:
        print name + ':', value

    if label_unk:
        unk_df = pd.read_csv(SENTENCES_CSV)
        unk_df = unk_df[(unk_df.q3 == 'other_unclear') | (unk_df.q4 == 'other_unclear')]
        df2 = create_unk_labeled_instances(unk_df)
        df3 = sentences_df(label_unk=df2)
        X_unk, y_unk, _, _ = load_dataset(df3, pad=True)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y_orig, test_size=0.2, random_state=0)

        print "Training and testing without labeled unknown on %i samples" % len(X_train)
        nn1 = FeedforwardNN(vocab_size,
                            nb_labels,
                            emb_dim,
                            maxlen,
                            layer_sizes,
                            activation,
                            embedding_weights,
                            pool_mode=pool_mode)
Example 15
    parser.add_argument('--dim', type=int)
    parser.add_argument('--weights', type=str)
    parser.add_argument('--wordvecs', type=str)
    parser.add_argument('--images', type=str)
    args = parser.parse_args()

    weights_path = args.weights
    word_vecs = args.wordvecs
    img_directory = args.images
    emb_dim = args.dim

    print "Loading images..."
    # load images and captions
    ic = skimage.io.imread_collection(os.path.join(img_directory, '*.jpg'),
                                      conserve_memory=True)
    df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)

    # sort df by ImageCollection ordering
    img_order = [f.split('/')[-1] for f in ic.files]
    df['img_file_ord'] = pd.Categorical(
        df['img_file'],
        categories=img_order,
        ordered=True)
    df = df.sort_values(by='img_file_ord')

    # X sentences should have the start token but not the end token,
    # since they'll be the input to the RNN.
    # Y sentences should have the end token, but not the start token,
    # since they'll be the predictions given by the RNN.
    print "Loading captions..."
    _, _, word2idx, _ = load_dataset(df, pad=True, truncate=True)
Example 16
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.layer_utils import print_summary

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from preprocessing.data_utils import (
    load_bin_vec,
    sentences_df,
    add_unknown_words,
    load_dataset,
)

word_vecs = sys.argv[1]
captions_file = sys.argv[2]

df = sentences_df(labels='full', drop_unk=True)
X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)

elc_sents = set(list(df.sentence))
flickr_sents = []
with open('../../results_20130124.token') as f:
    for line in f:
        line = line.strip().lower().split('\t')[-1]
        if line not in elc_sents:
            flickr_sents.append(line)

idx = len(word2idx) + 1
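# note: every Flickr-only word missing from word2idx is assigned this same shared index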
for sent in flickr_sents:
    for word in re.split("-| ", sent):
        if word not in word2idx:
            word2idx[word] = idx
Example 17
def train(model_type='parallel', label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1 # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size+1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index,:] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        return {'X': X,
                'y': y,
                'word2idx': word2idx,
                'l_enc': l_enc,
                'y_binary': y_binary,
                'labels': labels,
                'nb_labels': nb_labels,
                'maxlen': maxlen,
                'emb_dim': emb_dim,
                'vocab_size': vocab_size,
                'embedding_weights': embedding_weights,
                'cnn': cnn}

    params = [('filter_hs',filter_hs), ('nb_filters',nb_filters),
              ('dropout_p',dropout_p),
              ('trainable_embeddings',trainable_embeddings),
              ('pretrained_embeddings',pretrained_embeddings),
              ('batch_size',batch_size), ('nb_epoch',nb_epoch),
              ('lr',lr), ('beta_1',beta_1), ('beta_2',beta_2),
              ('epsilon',epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size,
                           nb_labels,
                           emb_dim,
                           maxlen,
                           embedding_weights,
                           filter_hs,
                           nb_filters,
                           dropout_p,
                           trainable_embeddings,
                           pretrained_embeddings,
                           model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)

        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch,
                                   lr, beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
Example 18
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.layer_utils import print_summary

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from preprocessing.data_utils import (
    load_bin_vec,
    sentences_df,
    add_unknown_words,
    load_dataset,
)

word_vecs = sys.argv[1]
captions_file = sys.argv[2]

df = sentences_df(labels='full', drop_unk=True)
X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)

elc_sents = set(list(df.sentence))
flickr_sents = []
with open('../../results_20130124.token') as f:
    for line in f:
        line = line.strip().lower().split('\t')[-1]
        if line not in elc_sents:
            flickr_sents.append(line)

idx = len(word2idx) + 1
for sent in flickr_sents:
    for word in re.split("-| ", sent):
        if word not in word2idx:
            word2idx[word] = idx
Example 19
                  metrics=['accuracy'])

    return model


if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--train', type=str)
    argparser.add_argument('--val', type=str)
    argparser.add_argument('--test', type=str)
    argparser.add_argument('--wordvecs', type=str)
    argparser.add_argument('--sentmodel', type=str)
    args = argparser.parse_args()

    print "Loading data... ",
    df = sentences_df(labels='full', drop_unk=True, keep_filename=True)
    train_samples = load_from_json(args.train)
    val_samples = load_from_json(args.val)
    test_samples = load_from_json(args.test)
    samples = [train_samples, val_samples, test_samples]
    vocab, word2idx = build_vocab(samples)
    vocab_size = len(vocab)
    maxlen = max(
        max(len(preprocess(sample['sentence1'])),
            len(preprocess(sample['sentence2']))) for split in samples
        for sample in split)
    l_enc = LabelEncoder()

    X_train, y_train, l_enc = vectorize(train_samples, word2idx, l_enc, df,
                                        maxlen)
    X_val, y_val, _ = vectorize(val_samples,