def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log', alpha=0.001, l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train, X_test, y_test = \
                    X[train], y[train], X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx * batch_size:
                                (minibatch_idx+1) * batch_size],
                        y_train[minibatch_idx * batch_size:
                                (minibatch_idx+1) * batch_size],
                        classes=np.unique(y))
                print "Epoch: %d/%d Train acc: %.4f" \
                    % (epoch+1, n_epoch, clf.score(X_train, y_train))
            fold_score = clf.score(X_test, y_test)
            print "Fold acc: %.4f" % fold_score
            scores.append(fold_score)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
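
# A minimal driver sketch for the function above. The __main__ guard and
# the 1-3 order sweep are assumptions; the original entry point is not
# shown in this fragment.
if __name__ == '__main__':
    for order in (1, 2, 3):
        print "ngram order:", order
        train_test_bow(order)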
def train(label_set='full', pool_mode='sum', layer_sizes=[512, 256],
          activation='prelu', drop_unk=False, word_vecs=None,
          return_net=False, cv=10, val_split=0.00, label_unk=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    embedding_weights = np.zeros((vocab_size+1, emb_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
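
# Sketch of the index-0 masking convention used above: row 0 of the
# embedding matrix is left all-zero so padded timesteps contribute
# nothing downstream. The toy vocabulary, 4-d vectors, and names below
# are illustrative only.
import numpy as np
toy_word2idx = {'a': 1, 'dog': 2}               # word indices start at 1
toy_vecs = {w: np.random.randn(4) for w in toy_word2idx}
W = np.zeros((len(toy_word2idx) + 1, 4))        # +1 row reserved for pad
for w, i in toy_word2idx.items():
    W[i, :] = toy_vecs[w]
assert not W[0].any()                           # pad row stays zero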
def main(rnn_layer='lstm', word_vecs=None):
    print "Loading data...",
    df = sentences_df(SENTENCES_CSV)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print X.shape
    y_binary = to_categorical(y)
    word_vectors = load_bin_vec(word_vecs, word2idx)
    add_unknown_words(word_vectors, word2idx)
    print "Data loaded."
    labels = np.unique(y)
    n_labels = labels.shape[0]
    max_len = X.shape[1]
    vocab_dim = 300
    n_vocab = len(word2idx) + 1  # 0 masking
    embedding_weights = np.zeros((n_vocab+1, vocab_dim))
    for word, index in word2idx.items():
        embedding_weights[index, :] = word_vectors[word]
    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        model = create_model(n_vocab, n_labels, vocab_dim,
                             embedding_weights, rnn_layer=rnn_layer)
        if i == 0:
            print_summary(model.layers)
        _, score = train_and_test_model(model, X[train], y_binary[train],
                                        X[test], y_binary[test])
        cv_scores.append(score)
        train_time = time.time() - start_time
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, score, len(test))
    print "avg cv acc: %.4f" % np.mean(cv_scores)
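
# Hypothetical invocation sketch; the original argument handling is not
# shown, and the .bin path is a placeholder for whatever word2vec binary
# load_bin_vec expects.
if __name__ == '__main__':
    main(rnn_layer='lstm',
         word_vecs='GoogleNews-vectors-negative300.bin')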
def train(label_set='full', drop_unk=False, word_vecs=None,
          setup_only=False, layer_sizes=[512, 256], pool_mode='sum'):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size+1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."
    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        nn = None
        nn = EnsembleNN(vocab_size, nb_labels, emb_dim, maxlen,
                        embedding_weights, filter_hs, nb_filters,
                        dropout_p, trainable_embeddings,
                        pretrained_embeddings, layer_sizes, pool_mode)
        if i == 0:
            print_summary(nn.model.layers)
        acc = train_and_test_model(nn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr,
                                   beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = nn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
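
# probas_to_classes (a helper in older Keras versions) maps the model's
# probability matrix to integer class labels. For multi-class softmax
# output it is essentially a row-wise argmax; a minimal equivalent
# sketch (the function name here is our own):
import numpy as np

def probas_to_classes_sketch(proba):
    # argmax over the class axis; single-column binary output would
    # instead be thresholded at 0.5
    return np.argmax(proba, axis=-1)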
def train_and_test_nb(ngram_order=1):
    print "ngram order:", ngram_order
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        df = sentences_df(SENTENCES_CSV, labels=label_set)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        clf = BernoulliNB()
        scores = cross_val_score(clf, X, y, cv=10)
        print_label_frequencies(df)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
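
# Note that BernoulliNB binarizes its input (binarize=0.0 by default),
# so the n-gram count matrix is effectively reduced to presence/absence
# features. A toy illustration with made-up counts:
import numpy as np
from sklearn.naive_bayes import BernoulliNB
Xc = np.array([[3, 0], [0, 2], [1, 1], [0, 1]])   # raw n-gram counts
yb = np.array([0, 1, 0, 1])
clf = BernoulliNB()            # thresholds counts at 0 internally
clf.fit(Xc, yb)
print clf.predict(np.array([[2, 0]]))             # -> [0]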
import os
import re
import sys

from keras.preprocessing.sequence import pad_sequences
from keras.utils.layer_utils import print_summary

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from preprocessing.data_utils import (
    load_bin_vec,
    sentences_df,
    add_unknown_words,
    load_dataset,
)

word_vecs = sys.argv[1]
captions_file = sys.argv[2]

df = sentences_df(labels='full', drop_unk=True)
X, y, word2idx, l_enc = load_dataset(df, ngram_order=1, pad=True)

elc_sents = set(list(df.sentence))
flickr_sents = []
with open('../../results_20130124.token') as f:
    for line in f:
        line = line.strip().lower().split('\t')[-1]
        if line not in elc_sents:
            flickr_sents.append(line)

# extend the vocabulary with words that occur only in the Flickr captions
idx = len(word2idx) + 1
for sent in flickr_sents:
    for word in re.split("-| ", sent):
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
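
# Sketch of an assumed next step: with word2idx extended over the Flickr
# vocabulary, any caption can be index-encoded and padded to the same
# length as the loaded data (X.shape[1]). pad_sequences and re are
# imported above.
encoded = [[word2idx[w] for w in re.split("-| ", s)]
           for s in flickr_sents[:5]]
padded = pad_sequences(encoded, maxlen=X.shape[1])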
scene_model = args.scenes

df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)

imlookup = {}
for imfile in set(df.img_file):
    impath = os.path.join(img_directory, imfile)
    im = scipy.misc.imread(impath)
    im = preprocess_im(im)
    imlookup[imfile] = im

# X sentences should have the start token but not the end token,
# since they'll be the input to the RNN.
# Y sentences should have the end token, but not the start token,
# since they'll be the predictions given by the RNN.
print "Loading captions..."
X, scene_labels, word2idx, l_enc = load_dataset(df, pad=True)
with open('../../models/word2idx.json') as j:
    word2idx = json.load(j)
with open('../../models/l_enc.pkl') as p:
    l_enc = pickle.load(p)

# partial captions are word 0..n
# next word is word n+1
# create samples for n=1...N-1
partial_captions = []
next_words = []
imfiles = []
scenes = []
for i, x_i in enumerate(X):
    indices = x_i[np.nonzero(x_i)[0]]
    for j in range(len(indices) - 1):
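        # assumed continuation -- the original loop body is cut off
        # here; per the comments above, each caption prefix becomes one
        # training sample and the word that follows it is the target
        # (the df/scene_labels row alignment with X is also assumed)
        partial_captions.append(indices[:j + 1])
        next_words.append(indices[j + 1])
        imfiles.append(df.img_file.iloc[i])
        scenes.append(scene_labels[i])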
def train(model_type='parallel', label_set='full', drop_unk=False,
          word_vecs=None, setup_only=False):
    print "Loading data..."
    df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=drop_unk)
    X, y, word2idx, l_enc = load_dataset(df, pad=True)
    print "X shape:", X.shape
    y_orig = y
    y_binary = to_categorical(y)
    labels = np.unique(y_orig)
    nb_labels = labels.shape[0]
    if drop_unk:
        label_set_str = label_set + ' (-unk)'
    else:
        label_set_str = label_set
    print "Number of labels: %i [%s]" % (nb_labels, label_set_str)
    if nb_labels > 2:
        y = y_binary
    maxlen = X.shape[1]
    vocab_size = len(word2idx) + 1  # 0 masking
    if pretrained_embeddings is True:
        word_vectors = load_bin_vec(word_vecs, word2idx)
        add_unknown_words(word_vectors, word2idx)
        embedding_weights = np.zeros((vocab_size+1, emb_dim))
        for word, index in word2idx.items():
            embedding_weights[index, :] = word_vectors[word]
    else:
        embedding_weights = None
    print "Data loaded."

    if setup_only:
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        return {'X': X, 'y': y, 'word2idx': word2idx, 'l_enc': l_enc,
                'y_binary': y_binary, 'labels': labels,
                'nb_labels': nb_labels, 'maxlen': maxlen,
                'emb_dim': emb_dim, 'vocab_size': vocab_size,
                'embedding_weights': embedding_weights, 'cnn': cnn}

    params = [('filter_hs', filter_hs), ('nb_filters', nb_filters),
              ('dropout_p', dropout_p),
              ('trainable_embeddings', trainable_embeddings),
              ('pretrained_embeddings', pretrained_embeddings),
              ('batch_size', batch_size), ('nb_epoch', nb_epoch),
              ('lr', lr), ('beta_1', beta_1), ('beta_2', beta_2),
              ('epsilon', epsilon)]
    print "\nModel type: %s" % model_type
    for (name, value) in params:
        print name + ':', value

    skf = StratifiedKFold(y_orig, n_folds=10, shuffle=True, random_state=0)
    cv_scores = []
    for i, (train, test) in enumerate(skf):
        start_time = time.time()
        cnn = None
        cnn = create_model(vocab_size, nb_labels, emb_dim, maxlen,
                           embedding_weights, filter_hs, nb_filters,
                           dropout_p, trainable_embeddings,
                           pretrained_embeddings, model_type=model_type)
        if i == 0:
            print_summary(cnn.model.layers)
        acc = train_and_test_model(cnn, X[train], y[train], X[test], y[test],
                                   batch_size, nb_epoch, lr,
                                   beta_1, beta_2, epsilon)
        cv_scores.append(acc)
        train_time = time.time() - start_time
        print('\nLabel frequencies in y[test]')
        print_label_frequencies((y_orig[test], l_enc))
        y_pred = cnn.model.predict(X[test])
        y_pred = probas_to_classes(y_pred)
        c = Counter(y_pred)
        total = float(len(y_pred))
        print('\nLabel frequencies in predict(y[test])')
        for label, count in c.most_common():
            print l_enc.inverse_transform(label), count, count / total
        print "fold %i/10 - time: %.2f s - acc: %.4f on %i samples" % \
            (i+1, train_time, acc, len(test))
    print "Avg cv accuracy: %.4f" % np.mean(cv_scores)
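
# Sketch (an assumed addition, not in the original): the fold-to-fold
# spread is worth reporting alongside the mean; inside train() the final
# print could become:
#     print "Avg cv accuracy: %.4f +/- %.4f" % (np.mean(cv_scores),
#                                               np.std(cv_scores))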
print "Data loaded."

params = [('batch_size', batch_size), ('nb_epoch', nb_epoch), ('lr', lr),
          ('beta_1', beta_1), ('beta_2', beta_2), ('epsilon', epsilon)]
for (name, value) in params:
    print name + ':', value

if label_unk:
    unk_df = pd.read_csv(SENTENCES_CSV)
    unk_df = unk_df[(unk_df.q3 == 'other_unclear') |
                    (unk_df.q4 == 'other_unclear')]
    df2 = create_unk_labeled_instances(unk_df)
    df3 = sentences_df(label_unk=df2)
    X_unk, y_unk, _, _ = load_dataset(df3, pad=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y_orig, test_size=0.2, random_state=0)
print "Training and testing without labeled unknown on %i samples" % \
    len(X_train)
nn1 = FeedforwardNN(vocab_size, nb_labels, emb_dim, maxlen,
                    layer_sizes, activation, embedding_weights,
                    pool_mode=pool_mode)
# hyperparameters follow the params list printed above
_, acc = train_and_test_model(nn1, X_train, y_train, X_test, y_test,
                              batch_size, nb_epoch, lr,
                              beta_1, beta_2, epsilon)
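
# Assumed follow-up sketch (the fragment ends before the comparison the
# print above implies): fold the auto-labeled 'unk' sentences into the
# training split only, leaving the test split clean, then retrain.
# Assumes X_unk was padded to the same maxlen as X and that y_unk is
# encoded the same way as y.
if label_unk:
    X_train_unk = np.vstack([X_train, X_unk])
    y_train_unk = np.concatenate([y_train, to_categorical(y_unk)])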
df = sentences_df(keep_filename=True, special_tokens=True, drop_unk=True)

# sort df by ImageCollection ordering
img_order = [f.split('/')[-1] for f in ic.files]
df['img_file_ord'] = pd.Categorical(
    df['img_file'], categories=img_order, ordered=True)
df = df.sort_values(by='img_file_ord')

# X sentences should have the start token but not the end token,
# since they'll be the input to the RNN.
# Y sentences should have the end token, but not the start token,
# since they'll be the predictions given by the RNN.
print "Loading captions..."
_, _, word2idx, _ = load_dataset(df, pad=True, truncate=True)
Xdf = df.copy()
ydf = df.copy()
Xdf['sentence'] = Xdf['sentence'].apply(lambda x: ' '.join(x.split()[:-1]))
ydf['sentence'] = ydf['sentence'].apply(lambda x: ' '.join(x.split()[1:]))
X_sents, _, _, _ = load_dataset(Xdf, pad=True, word2idx=word2idx)
y_sents, _, _, _ = load_dataset(ydf, pad=True, word2idx=word2idx)
scene_labels = Xdf.label
vocab_size = len(word2idx) + 1
max_caption_len = X_sents.shape[1]

ims_train, ims_test = ic[:4000], ic[4000:]
X_sents_train, y_sents_train = X_sents[:20000], y_sents[:20000]
X_sents_test, y_sents_test = X_sents[20000:], y_sents[20000:]
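
# Sanity-check sketch: the 4000/20000 split implies exactly 5 captions
# per image, so after the sort above image i owns caption rows
# 5*i .. 5*i+4. The assertions are an assumed addition:
assert X_sents.shape[0] == 5 * len(ic.files)
assert X_sents_train.shape[0] == 5 * len(ims_train)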
from preprocessing.data_utils import (
    load_bin_vec, add_unknown_words
)
from paths import GENERATED_TEXT


def log(content, outfile):
    """Write to stdout and outfile. outfile is open file"""
    outfile.write(content)
    sys.stdout.write(content)

# load sentences
df = sentences_df(labels='function')
labels = np.unique(df.label)
text = '\n'.join(s for s in df.sentence.values).split()
sents, _, word2idx, l_enc = load_dataset(df, pad=True)
idx2word = {i: w for w, i in word2idx.items()}
maxlen = sents.shape[1]
vocab = word2idx.keys()

# cut X into sequences of 10 words
# given 9 words, predict 10th
seqlen = 9
step = 2
sentences = []
next_words = []
mask = np.nonzero(sents)
text = sents[mask].astype(np.int32)
for i in range(0, len(text) - seqlen, step):
    sentences.append(text[i: i + seqlen])
    next_words.append(text[i + seqlen])
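
# Sketch of an assumed next step: stack the windows into dense arrays
# for a next-word model (np is imported above the cut). Each X_seq row
# holds seqlen word indices; y_seq holds the index to predict.
X_seq = np.asarray(sentences, dtype=np.int32)
y_seq = np.asarray(next_words, dtype=np.int32)
print X_seq.shape, y_seq.shape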