def main():
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=10,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='sgd',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.1,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--test_fold', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Dev fold: default=%default')
    parser.add_option('--n_labels', dest='n_labels', default=14,
                      help='Number of labels to use (max 15): default=%default')
    parser.add_option('--w_word', dest='w_word', default=1.0,
                      help='Weight on word prediction: default=%default')
    parser.add_option('--w_sentence', dest='w_sentence', default=1.0,
                      help='Weight on sentence prediction: default=%default')
    parser.add_option('--w_article', dest='w_article', default=1.0,
                      help='Weight on article prediction: default=%default')

    (options, args) = parser.parse_args()

    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors
    test_fold = int(options.test_fold)
    dev_fold = int(options.dev_fold)
    n_labels = int(options.n_labels)
    w_word = float(options.w_word)
    w_sentence = float(options.w_sentence)
    w_article = float(options.w_article)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300

    np.__config__.show()

    # load the annotated articles and the cross-validation splits
    article_sent_words, article_word_labels, vocab, n_labels, n_unique_articles, annotation_counts = \
        load_data(input_filename, n_labels)
    train_keys, dev_keys, test_keys = ds.get_all_splits(test_fold=test_fold, dev_subfold=dev_fold)

    vocab = vocab.keys()
    vocab.sort()
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(article_sent_words)
    keys = article_sent_words.keys()
    keys.sort()
    print keys[:10]
    print "Loaded %d annotations for %d articles using %d labels" % (n_articles, n_unique_articles, n_labels)
    print list(train_keys)[:10]

    # restrict the annotation keys to their train / dev / test splits
    train_keys = [k for k in keys if k.split('__')[0] in train_keys]
    dev_keys = [k for k in keys if k.split('__')[0] in dev_keys]
    test_keys = [k for k in keys if k.split('__')[0] in test_keys]

    #dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    #train_indices = list(set(range(n_articles)) - set(dev_indices))
    #train_keys = [keys[i] for i in train_indices]
    #dev_keys = [keys[i] for i in dev_indices]

    # load pre-trained word vectors (GloVe or word2vec), or fall back to previously saved vectors
    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)
    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)
    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)

    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    # index words into vocabulary and make mask and label arrays
    idxs_dict = {}
    mask_dict = {}
    label_dict = {}
    for key, sent_words in article_sent_words.items():
        n_sents = len(sent_words)
        max_len = max([len(s) for s in sent_words])
        word_idxs = np.zeros([max_len, n_sents], dtype=np.int32)
        mask = np.zeros([max_len, n_sents], dtype=np.int32)
        labels = np.zeros([max_len, n_sents, n_labels], dtype=np.int32)
        for s_i, s in enumerate(sent_words):
            n_words = len(s)
            word_idxs[:n_words, s_i] = [vocab_index[w] for w in s]
            mask[:n_words, s_i] = 1
            labels[:n_words, s_i, :] = article_word_labels[key][s_i][:, :]
        idxs_dict[key] = word_idxs
        mask_dict[key] = mask
        label_dict[key] = labels

    # sort training articles by size for the first epoch
    article_lengths = [(idxs_dict[k].size, k) for k in train_keys]
    article_lengths.sort()

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"
    optimizer, opti_params = get_optimizer(opti_method, momentum)
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels, optimizer, opti_params,
                    initial_embeddings=initial_embeddings, alpha=alpha,
                    update=opti_method, seed=theano_seed, momentum=momentum,
                    word_weight=w_word, sent_weight=w_sentence, article_weight=w_article)  # create RNN

    best_dev_f1 = np.zeros(n_labels)
    corr_test_f1 = np.zeros(n_labels)

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0

        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in article_lengths]
        else:
            keys = train_keys
            random.shuffle(keys)

        print "epoch\titems\tloss\tl+reg\terrs"

        # consider each article in turn
        for k_i, k in enumerate(keys):
            idxs = idxs_dict[k]
            mask = mask_dict[k]
            word_labels = label_dict[k]

            p_word_labels, p_sent_labels, p_article_labels, log_loss, loss = \
                bilstm.train(idxs, mask, word_labels, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss

            # threshold the predicted probabilities at each level
            y_pred_words = np.array(p_word_labels > 0.5, dtype=int)  # (n_words, n_sents, n_labels)
            y_pred_sents = np.array(p_sent_labels > 0.5, dtype=int)
            y_pred_article = np.array(p_article_labels > 0.5, dtype=int)

            # gold sentence and article labels are the max over word labels
            sent_labels = np.max(word_labels, axis=0)
            article_labels = np.max(sent_labels, axis=0)
            mistakes += np.sum(np.abs(article_labels - y_pred_article)) / float(n_labels)

            # optional debugging output for the first training example
            to_print = False
            if k_i == 0 and to_print:
                print "\tTraining example:", k
                print article_labels
                print np.array(y_pred_article, dtype=int)
                max_len, n_sents = mask.shape
                for s_i in range(n_sents):
                    if np.max(y_pred_words[:, s_i, :]) == 1:
                        n_words = np.argmin(mask[:, s_i]) - 1
                        sentence = [vocab[c] for c in idxs[:n_words, s_i]]
                        print "Full:", k_i, ' '.join(sentence)
                        for code in range(n_labels):
                            if y_pred_sents[s_i, code] == 1:
                                highlight = [w if word_labels[w_i, s_i, code] else ' ' * len(w)
                                             for w_i, w in enumerate(sentence)]
                                print '-------------------------------------'
                                print "True:", k_i, code, ' '.join(highlight)
                                highlight = [w if y_pred_words[w_i, s_i, code] else ' ' * len(w)
                                             for w_i, w in enumerate(sentence)]
                                #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                                print '-------------------------------------'
                                print "Pred:", k_i, code, ' '.join(highlight)
                                print ""

            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i + 1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                    (epoch, k_i, sum_log_loss / d, sum_loss / d, mistakes / d)

        if not no_eval:
            print "\nDev evaluation"
            valid_z_o_loss, valid_log_loss, valid_f1, valid_per_class_f1 = \
                evaluate(idxs_dict, mask_dict, label_dict, dev_keys, bilstm, vocab, annotation_counts)
            print "\nTest evaluation"
            test_z_o_loss, test_log_loss, test_f1, test_per_class_f1 = \
                evaluate(idxs_dict, mask_dict, label_dict, test_keys, bilstm, vocab, annotation_counts)
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tdev_f1=%.3f\t'
                   'test_log_loss=%.3f\ttest_0/1=%.3f\ttest_f1=%.3f\t') % \
                (epoch, valid_log_loss, valid_z_o_loss, valid_f1, test_log_loss, test_z_o_loss, test_f1)

            # track the best per-class dev F1 and the corresponding test F1
            for k in range(n_labels):
                if valid_per_class_f1[k] > best_dev_f1[k]:
                    best_dev_f1[k] = valid_per_class_f1[k]
                    corr_test_f1[k] = test_per_class_f1[k]
            print "Best valid f1s:", best_dev_f1
            print "Corr. test f1s:", corr_test_f1

        # decay learning rate
        lr *= lr_decay
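
# Illustrative sketch only (not part of the original script): the BiLSTM class
# itself is not shown in this section, so this is one plausible way its
# w_word / w_sentence / w_article weights could combine per-level losses.
# Sentence and article labels are max-pooled from word labels, mirroring the
# training loop above. The function and argument names below are hypothetical;
# numpy is assumed to already be imported as np, as in the surrounding code.
def multilevel_loss_sketch(p_words, word_labels, mask, w_word, w_sentence, w_article, eps=1e-6):
    # p_words, word_labels: (max_len, n_sents, n_labels); mask: (max_len, n_sents)
    p = np.clip(p_words, eps, 1.0 - eps)
    word_bce = -(word_labels * np.log(p) + (1 - word_labels) * np.log(1 - p))
    # average the word-level loss over unpadded positions only
    word_loss = np.sum(word_bce * mask[:, :, None]) / (np.sum(mask) * p_words.shape[-1])

    # max-pool probabilities and labels up to the sentence and article levels
    p_sents = np.max(p_words * mask[:, :, None], axis=0)   # (n_sents, n_labels)
    p_article = np.max(p_sents, axis=0)                    # (n_labels,)
    sent_labels = np.max(word_labels, axis=0)
    article_labels = np.max(sent_labels, axis=0)

    def bce(p, y):
        p = np.clip(p, eps, 1.0 - eps)
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    return (w_word * word_loss +
            w_sentence * bce(p_sents, sent_labels) +
            w_article * bce(p_article, article_labels))
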
def main():
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-g', dest='gamma', default=0.5,
                      help='Gamma (proportional weight on words): default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=20,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='adagrad',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.05,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--n_dev', dest='n_dev', default=500,
                      help='Number of random dev sentences: default=%default')

    (options, args) = parser.parse_args()

    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    gamma = float(options.gamma)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    n_dev = int(options.n_dev)
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300
    nd = 2

    np.__config__.show()

    sents, word_labels, vocab, n_labels = load_data(input_filename)

    vocab = vocab.keys()
    vocab.sort()
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(sents)
    print "Loaded", n_articles, "annotated sentences"

    dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    train_indices = list(set(range(n_articles)) - set(dev_indices))
    n_train = len(train_indices)
    n_dev = len(dev_indices)

    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)
    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)
    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)

    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    idxs_dict = {}
    for k in range(n_articles):
        words = sents[k]
        idxs_dict[k] = np.array([vocab_index[w] for w_i, w in enumerate(words)], dtype=np.int32)

    sent_lengths = [(len(idxs_dict[k]), k) for k in train_indices]
    sent_lengths.sort()

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels,
                    initial_embeddings=initial_embeddings, alpha=alpha,
                    update=opti_method, seed=theano_seed, momentum=momentum, gamma=gamma)  # create RNN

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0

        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in sent_lengths]
        else:
            keys = train_indices
            random.shuffle(keys)

        print "epoch\titems\tloss\tl+reg\terrs"

        # consider each sentence in turn
        for k_i, k in enumerate(keys):
            text = sents[k]
            idxs = idxs_dict[k]
            codes = word_labels[k]

            y_pred, y_pred_max, log_loss, loss = bilstm.train(np.array(idxs, dtype=np.int32), codes, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss

            codes_max = np.max(codes, axis=0)
            mistakes += np.sum(np.abs(codes_max - y_pred_max)) / float(len(codes_max))

            if k_i == -1:
                print "Training example:"
                sentence = [vocab[c] for c in idxs]
                print k_i, ' '.join(sentence)
                print codes_max
                print y_pred_max
                for code in range(len(y_pred_max)):
                    if y_pred_max[code] == 1:
                        highlight = [w if codes[w_i, code] else ' ' * len(w)
                                     for w_i, w in enumerate(sentence)]
                        print '-------------------------------------'
                        print "True:", k_i, code, ' '.join(highlight)
                        highlight = [w if y_pred[w_i, code] else ' ' * len(w)
                                     for w_i, w in enumerate(sentence)]
                        #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                        print '-------------------------------------'
                        print "Pred:", k_i, code, ' '.join(highlight)
                        print ""
                print np.abs(codes_max - y_pred_max) / float(len(codes_max))

            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i + 1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                    (epoch, k_i, sum_log_loss / d, sum_loss / d, mistakes / d)

        if not no_eval:
            valid_z_o_loss, valid_log_loss, valid_f1 = evaluate(idxs_dict, word_labels, dev_indices, bilstm, vocab)
            test_z_o_loss = 0
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tvalid_f1=%.3f') % \
                (epoch, valid_log_loss, valid_z_o_loss, valid_f1)

        # decay learning rate
        lr *= lr_decay
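
# Assumed entry point: no guard is shown in this section, so this is a minimal
# sketch of the standard one (the project and JSON arguments follow the usage
# string "%prog project documents.json" above).
if __name__ == '__main__':
    main()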