# Standard-library and third-party imports used by this snippet; the project-specific
# modules referenced below (dirs, ds, vector_utils, BiLSTM, load_data, evaluate,
# get_optimizer) are assumed to be imported elsewhere in the original file.
from optparse import OptionParser
import random

import numpy as np


def main():
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=10,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='sgd',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.1,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--test_fold', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Dev fold: default=%default')
    parser.add_option('--n_labels', dest='n_labels', default=14,
                      help='Number of labels to use (max 15): default=%default')
    parser.add_option('--w_word', dest='w_word', default=1.0,
                      help='Weight on word prediction: default=%default')
    parser.add_option('--w_sentence', dest='w_sentence', default=1.0,
                      help='Weight on sentence prediction: default=%default')
    parser.add_option('--w_article', dest='w_article', default=1.0,
                      help='Weight on article prediction: default=%default')


    (options, args) = parser.parse_args()
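    # Example invocation (a sketch; the script file name and GloVe path are
    # hypothetical and only illustrate the positional arguments and options above):
    #   python train_bilstm.py my_project documents.json --glove_file vectors/glove.txt -e 20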
    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors
    test_fold = int(options.test_fold)
    dev_fold = int(options.dev_fold)
    n_labels = int(options.n_labels)
    w_word = float(options.w_word)
    w_sentence = float(options.w_sentence)
    w_article = float(options.w_article)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300  # word embedding dimensionality (matches the 300-d GloVe / word2vec vectors loaded below)

    np.__config__.show()

    article_sent_words, article_word_labels, vocab, n_labels, n_unique_articles, annotation_counts = load_data(input_filename, n_labels)
    train_keys, dev_keys, test_keys = ds.get_all_splits(test_fold=test_fold, dev_subfold=dev_fold)

    vocab = sorted(vocab.keys())
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(article_sent_words)
    keys = sorted(article_sent_words.keys())
    print "First article keys:", keys[:10]
    print "Loaded %d annotations for %d articles using %d labels" % (n_articles, n_unique_articles, n_labels)


    print "First training split keys:", list(train_keys)[:10]
    train_keys = [k for k in keys if k.split('__')[0] in train_keys]
    dev_keys = [k for k in keys if k.split('__')[0] in dev_keys]
    test_keys = [k for k in keys if k.split('__')[0] in test_keys]

    #dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    #train_indices = list(set(range(n_articles)) - set(dev_indices))

    #train_keys = [keys[i] for i in train_indices]
    #dev_keys = [keys[i] for i in dev_indices]

    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)

    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)

    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)

    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    # index words into vocabulary and make mask and label arrays
    idxs_dict = {}
    mask_dict = {}
    label_dict = {}
    for key, sent_words in article_sent_words.items():
        n_sents = len(sent_words)
        max_len = max([len(s) for s in sent_words])
        word_idxs = np.zeros([max_len, n_sents], dtype=np.int32)
        mask = np.zeros([max_len, n_sents], dtype=np.int32)
        labels = np.zeros([max_len, n_sents, n_labels], dtype=np.int32)
        for s_i, s in enumerate(sent_words):
            n_words = len(s)
            word_idxs[:n_words, s_i] = [vocab_index[w] for w in s]
            mask[:n_words, s_i] = 1
            labels[:n_words, s_i, :] = article_word_labels[key][s_i][:, :]
        idxs_dict[key] = word_idxs
        mask_dict[key] = mask
        label_dict[key] = labels
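    # Resulting shapes per article: word_idxs and mask are (max_len, n_sents);
    # labels is (max_len, n_sents, n_labels). mask is 1 for real tokens and 0
    # for padding positions.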

    article_lengths = [(idxs_dict[k].size, k) for k in train_keys]
    article_lengths.sort()
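    # shortest articles first; this ordering is only used for the first training
    # epoch (see the epoch loop below), after which the training keys are shuffled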

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"

    optimizer, opti_params = get_optimizer(opti_method, momentum)
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels, optimizer, opti_params, initial_embeddings=initial_embeddings,
                    alpha=alpha, update=opti_method, seed=theano_seed, momentum=momentum,
                    word_weight=w_word, sent_weight=w_sentence, article_weight=w_article)  # create RNN

    best_dev_f1 = np.zeros(n_labels)
    corr_test_f1 = np.zeros(n_labels)

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in article_lengths]
        else:
            keys = train_keys
            random.shuffle(keys)
        print "epoch\titems\tloss\tl+reg\terrs"

        # consider each sentence in turn
        for k_i, k in enumerate(keys):
            idxs = idxs_dict[k]
            mask = mask_dict[k]
            word_labels = label_dict[k]

            p_word_labels, p_sent_labels, p_article_labels, log_loss, loss = bilstm.train(idxs, mask, word_labels, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss

            y_pred_words = np.array(p_word_labels > 0.5, dtype=int)  # (n_words, n_sents, n_labels)
            y_pred_sents = np.array(p_sent_labels > 0.5, dtype=int)
            y_pred_article = np.array(p_article_labels > 0.5, dtype=int)

            sent_labels = np.max(word_labels, axis=0)
            article_labels = np.max(sent_labels, axis=0)
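            # gold labels are pooled upward with max: a label is on for a sentence
            # if any of its words carries it, and on for the article if any sentence
            # does; shapes go (max_len, n_sents, n_labels) -> (n_sents, n_labels) -> (n_labels,)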
            mistakes += np.sum(np.abs(article_labels - y_pred_article))/float(n_labels)

            to_print = False
            if k_i == 0 and to_print:
                print "\tTraining example:", k
                print article_labels
                print np.array(y_pred_article, dtype=int)
                max_len, n_sents = mask.shape
                for s_i in range(n_sents):
                    if np.max(y_pred_words[:, s_i, :]) == 1:
                        n_words = int(np.sum(mask[:, s_i]))  # number of real (unmasked) tokens
                        sentence = [vocab[c] for c in idxs[:n_words, s_i]]
                        print "Full:", k_i, ' '.join(sentence)
                        for code in range(n_labels):
                            if y_pred_sents[s_i, code] == 1:
                                highlight = [w if word_labels[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                print '-------------------------------------'
                                print "True:", k_i, code, ' '.join(highlight)
                                highlight = [w if y_pred_words[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                                print '-------------------------------------'
                                print "Pred:", k_i, code, ' '.join(highlight)
                                print ""

            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i+1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                      (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d)

        if not no_eval:
            print "\nDev evaluation"
            valid_z_o_loss, valid_log_loss, valid_f1, valid_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, dev_keys, bilstm, vocab, annotation_counts)
            print "\nTest evaluation"
            test_z_o_loss, test_log_loss, test_f1, test_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, test_keys, bilstm, vocab, annotation_counts)
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tdev_f1=%.3f\ttest_log_loss=%.3f\ttest_0/1=%.3f\ttest_f1=%.3f\t') % (epoch, valid_log_loss, valid_z_o_loss, valid_f1, test_log_loss, test_z_o_loss, test_f1)
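            # per-class model selection: keep the best dev F1 seen so far for each
            # label, along with the test F1 from the same epoch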
            for k in range(n_labels):
                if valid_per_class_f1[k] > best_dev_f1[k]:
                    best_dev_f1[k] = valid_per_class_f1[k]
                    corr_test_f1[k] = test_per_class_f1[k]
            print "Best valid f1s:", best_dev_f1
            print "Corr. test f1s:", corr_test_f1

        # decay learning rate
        lr *= lr_decay
Example 2
def main():
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-g', dest='gamma', default=0.5,
                      help='Gamma (proportional weight on words): default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=20,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='adagrad',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.05,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--n_dev', dest='n_dev', default=500,
                      help='Number of random dev sentences: default=%default')

    (options, args) = parser.parse_args()
    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    gamma = float(options.gamma)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    n_dev = int(options.n_dev)
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors


    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300  # word embedding dimensionality (matches the 300-d GloVe / word2vec vectors loaded below)
    nd = 2

    np.__config__.show()

    sents, word_labels, vocab, n_labels = load_data(input_filename)

    vocab = sorted(vocab.keys())
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(sents)
    print "Loaded", n_articles, "annotated sentences"
    dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    train_indices = list(set(range(n_articles)) - set(dev_indices))
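    # hold out n_dev randomly chosen sentences for validation and train on the rest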

    n_train = len(train_indices)
    n_dev = len(dev_indices)


    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)

    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)

    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)

    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    # index each sentence's words into the vocabulary
    idxs_dict = {}
    for k in range(n_articles):
        words = sents[k]
        idxs_dict[k] = np.array([vocab_index[w] for w in words], dtype=np.int32)

    sent_lengths = [(len(idxs_dict[k]), k) for k in train_indices]
    sent_lengths.sort()
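    # shortest sentences first; used only to order the first epoch (shuffled afterwards)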

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels, initial_embeddings=initial_embeddings, alpha=alpha,
                    update=opti_method, seed=theano_seed, momentum=momentum, gamma=gamma)  # create RNN

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in sent_lengths]
        else:
            keys = train_indices
            random.shuffle(keys)
        print "epoch\titems\tloss\tl+reg\terrs"

        # consider each sentence in turn
        for k_i, k in enumerate(keys):
            text = sents[k]
            idxs = idxs_dict[k]
            codes = word_labels[k]

            y_pred, y_pred_max, log_loss, loss = bilstm.train(np.array(idxs, dtype=np.int32), codes, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss

            codes_max = np.max(codes, axis=0)
            mistakes += np.sum(np.abs(codes_max - y_pred_max))/float(len(codes_max))
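            # codes_max collapses the word-level gold labels to sentence level
            # (max over words); mistakes accumulates the fraction of labels on
            # which the sentence-level prediction disagrees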

            if k_i == -1:
                print "Training example:"
                sentence = [vocab[c] for c in idxs]
                print k_i, ' '.join(sentence)
                print codes_max
                print y_pred_max
                for code in range(len(y_pred_max)):
                    if y_pred_max[code] == 1:
                        highlight = [w if codes[w_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                        print '-------------------------------------'
                        print "True:", k_i, code, ' '.join(highlight)
                        highlight = [w if y_pred[w_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                        #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                        print '-------------------------------------'
                        print "Pred:", k_i, code, ' '.join(highlight)
                        print ""
                print np.abs(codes_max - y_pred_max)/float(len(codes_max))


            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i+1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                      (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d)

        if not no_eval:
            valid_z_o_loss, valid_log_loss, valid_f1 = evaluate(idxs_dict, word_labels, dev_indices, bilstm, vocab)
            test_z_o_loss = 0
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tvalid_f1=%.3f') % (epoch, valid_log_loss, valid_z_o_loss, valid_f1)


        # decay learning rate
        lr *= lr_decay