Example 1
    def predict(self, test_data, model_details=None, options={}):
        super(knet, self).predict(test_data, model_details, options)
        assert len(test_data) != 0, "test_data list shouldn't be empty"
        self.test_file = test_data[0]
        if not os.path.exists(self.test_file):
            assert False, "File doesn't exist"

        print("Start Predicting")
        direct_entity, direct_context, self.predict_types = util.raw2npy(
            self.test_file)

        embedding = np.load(self.embedding)
        model = models.KA_D("KA+D", self.disamb_file)

        sess = tf.Session()
        w2v = util.build_vocab(self.glove, model.word_size)
        sess.run(model.initializer)
        model.saver.restore(sess, self.model_name)
        util.printlog("Begin computing direct outputs")
        self.final_result = util.direct(w2v, sess, model, direct_entity,
                                        direct_context, embedding,
                                        self.type_file)

        dir_name = os.path.dirname(test_data[0])
        output_file = os.path.join(dir_name, "entity_typing_test_output.txt")
        final_str = ""
        for i in range(len(self.final_result)):
            final_str = "{}\n{}\t{}\t{}".format(final_str,
                                                " ".join(direct_entity[i]),
                                                self.predict_types[i],
                                                self.final_result[i].lower())
        with open(output_file, 'w') as fout:
            fout.write(final_str.strip())

        return output_file
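The file written by `predict` is tab-separated, one mention per line: the space-joined entity tokens, the type column produced by `util.raw2npy`, and the lower-cased prediction. A minimal sketch of reading it back under that assumption (helper name is illustrative):

def read_typing_output(path):
    # Parse entity_typing_test_output.txt as written above:
    # "<entity tokens>\t<input type>\t<predicted type>" per line.
    rows = []
    with open(path) as f:
        for line in f:
            entity, input_type, predicted = line.rstrip("\n").split("\t")
            rows.append((entity.split(" "), input_type, predicted))
    return rows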
Example 2
def main(args):
    with open('data/multim_poem.json') as f, open(
            'data/unim_poem.json') as unif:
        multim = json.load(f)
        unim = json.load(unif)
    if args.bert:
        word2idx, idx2word = util.build_vocab_bert(unim + multim,
                                                   args.threshold)
    else:
        word2idx, idx2word = util.build_vocab(unim + multim, args.threshold)
    sys.stderr.write('vocab size {}\n'.format(len(word2idx)))
    if args.bert:
        with open('./data/vocab_bert.pkl', 'wb') as f:
            pickle.dump([word2idx, idx2word], f)
    with open(args.vocab_path, 'wb') as f:
        pickle.dump([word2idx, idx2word], f)
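Both pickle files store the same two-element list, `[word2idx, idx2word]`. A minimal sketch of loading the vocabulary back (the path below is illustrative; use whatever was passed as `args.vocab_path`):

import pickle

# Reload the vocabulary pair written by main(): word2idx maps tokens to ids,
# idx2word is the inverse mapping.
with open('data/vocab.pkl', 'rb') as f:  # hypothetical path
    word2idx, idx2word = pickle.load(f)
print('vocab size', len(word2idx))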
Example 3
from sklearn.metrics import accuracy_score as acc


def score(w, models, log_prob=True):
    raw = models["UK"].transform(
        w, log_prob=log_prob) / models["USA"].transform(w, log_prob=log_prob)
    return max(raw, 1 / raw) - 1


models = {}
for country in ["UK", "USA"]:
    print("Country", country)
    data = list(
        map(str,
            util.load_data("./data/%s_tokenized.txt" % country)[5].tolist()))
    vocab = util.build_vocab(data)
    inverted_vocab = {k: v for v, k in enumerate(vocab)}

    docs = []
    for d in tqdm(data, desc="Processing docs"):
        docs.append(
            np.array([inverted_vocab.get(x, -1) for x in d.split(" ")]))

    model = FrequencyModel(inverted_vocab)
    model.fit(docs)
    models[country] = model
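Once both FrequencyModel instances are fitted, `score` can rank words by how unevenly they are used between the two corpora. A small illustrative usage, assuming `transform` accepts a single word as the definition above implies (the candidate words are made up for the example):

# Rank a few candidate words by their UK/USA divergence score.
candidates = ["colour", "color", "lorry", "truck"]
for w in sorted(candidates, key=lambda x: score(x, models), reverse=True):
    print(w, score(w, models))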
Example 4
import pandas as pd
import numpy as np
import random
import re
import util


# regexp = re.compile('[,.;:@#?!&$”\"\-]+')
SPECIAL = ',.;:@#?!&$”"-'
inverted_vocabs = {}
for c in ["UK", "USA"]:
    data = list(map(str, util.load_data("./data/%s_tokenized.txt" % c)[5].tolist()))
    vocab = util.build_vocab(data, least_freq=21)
    inverted_vocabs[c] = {k: v for v, k in enumerate(vocab)}

print("start joint_vocab")
joint_vocab = set(inverted_vocabs["UK"].keys()) & set(inverted_vocabs["USA"].keys())
joint_vocab = {w for w in joint_vocab if not any(special in w for special in SPECIAL)}
word_list = pd.read_csv("./data/word_list.csv", encoding="gbk")

# Clean word list
words = word_list["Word"]
for i in range(len(words)):
    words[i] = re.sub(r"\(.+\)", "", words[i])
    words[i] = re.sub(r"\[.+\]", "", words[i])
    words[i] = re.sub("\r\n", ",", words[i])
    words[i] = re.sub(" ", "", words[i])
word_list["Word"] = words


# Match joint words
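The example stops before the matching step; a minimal sketch of one way it could continue, keeping only word-list entries whose cleaned variants occur in both corpora (this continuation is an assumption, not part of the original source):

# Hypothetical continuation: each cleaned cell may hold several
# comma-separated variants; keep entries with at least one match.
matched = [
    entry for entry in word_list["Word"]
    if any(w in joint_vocab for w in str(entry).split(",") if w)
]
print("matched entries:", len(matched))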
Example 5
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print(args)
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check that save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
    err_msg += '## Please run the following:\n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    efficient_gpu = False
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)
        efficient_gpu = args.get('efficient_gpu', False)

    def to_gpu(x):
        if args['gpu'] >= 0:
            return chainer.cuda.to_gpu(x)
        return x

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    input_idx = list(map(int, args['input_idx'].split(',')))
    output_idx = list(map(int, args['output_idx'].split(',')))
    word_input_idx = input_idx[0]  # NOTE: word_idx is first column!
    additional_input_idx = input_idx[1:]
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown pos tags
    # TODO: compute unk words
    vocab_adds = []
    if is_train:
        sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence]
                                 for sentence in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)

        # Additional setup
        for ad_feat_id in additional_input_idx:
            sentences_additional_train = [[feat_obj[ad_feat_id] for feat_obj in sentence]
                                          for sentence in sentences_train]
            vocab_add = util.build_vocab(sentences_additional_train)
            vocab_adds.append(vocab_add)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    if args.get('word_emb_file', False):
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_emb_vocab_type = args.get('word_emb_vocab_type')

        def assert_word_emb_shape(shape1, shape2):
            err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})'''
            if shape1 != shape2:
                err_msg = err_msg.format(str(shape1), str(shape2))
                raise ValueError(err_msg)

        def assert_no_emb(word_vecs):
            err_msg = '''There is no-embeddings! Please check your file `--word_emb_file`'''
            if word_vecs.shape[0] == 0:
                raise ValueError(err_msg)

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            vocab = vocab_glove
        elif word_emb_vocab_type == 'replace_only':
            word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
            assert_no_emb(word_vecs)

        elif word_emb_vocab_type == 'additional':
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            additional_vecs = []
            for word, word_idx in sorted(vocab_glove.items(), key=lambda x: x[1]):
                if word not in vocab:
                    vocab[word] = len(vocab)
                    additional_vecs.append(word_vecs[word_idx])
            additional_vecs = np.array(additional_vecs, dtype=np.float32)

    if args.get('vocab_file', False):
        vocab_file = args['vocab_file']
        vocab = util.load_vocab(vocab_file)

    if args.get('vocab_char_file', False):
        vocab_char_file = args['vocab_char_file']
        vocab_char = util.load_vocab(vocab_char_file)

    vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items())
    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    tmp_xp = xp
    if efficient_gpu:
        tmp_xp = np  # use CPU (numpy)

    def parse_to_word_ids(sentences, word_input_idx, vocab):
        return util.parse_to_word_ids(sentences, xp=tmp_xp, vocab=vocab,
                                      UNK_IDX=UNK_IDX, idx=word_input_idx)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences, xp=tmp_xp, vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX, idx=word_input_idx)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences, xp=tmp_xp, vocab=vocab_tags,
                                     UNK_IDX=-1, idx=-1)

    x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)
    x_train_additionals = [parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i])
                           for i, ad_feat_id in enumerate(additional_input_idx)]

    x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)
    x_dev_additionals = [parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i])
                         for i, ad_feat_id in enumerate(additional_input_idx)]

    y_dev_cpu = [[w[-1] for w in sentence]
                 for sentence in sentences_dev]
    # tag_names = []
    tag_names = list(set([tag[2:] if len(tag) >= 2 else tag[0] for tag in vocab_tags.keys()]))

    x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)
    x_test_additionals = [parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i])
                          for i, ad_feat_id in enumerate(additional_input_idx)]

    cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)

    init_emb = None

    if is_train:
        util.write_vocab(save_vocab, vocab)
        util.write_vocab(save_vocab_char, vocab_char)
        util.write_vocab(save_tags_vocab, vocab_tags)
        util.write_vocab(save_train_config, args)

    n_vocab_add = [len(_vadd) for _vadd in vocab_adds]

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab), n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'], init_emb=init_emb,
                         char_input_dim=args['n_char_emb'],
                         char_hidden_dim=args['n_char_hidden'],
                         n_label=len(vocab_tags),
                         n_add_feature_dim=args['n_add_feature_emb'],
                         n_add_feature=len(n_vocab_add),
                         n_vocab_add=n_vocab_add,
                         use_cudnn=args['use_cudnn'])
    my_cudnn(args['use_cudnn'])

    if args.get('word_emb_file', False):

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data = word_vecs[:]
        elif word_emb_vocab_type == 'replace_only':
            assert_no_emb(word_vecs)
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data[word_ids] = word_vecs[:]

        elif word_emb_vocab_type == 'additional':
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            v_size = additional_vecs.shape[0]
            net.word_embed.W.data[-v_size:] = additional_vecs[:]

    if args.get('return_model', False):
        return net

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(_) for _ in x_ad[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)

        _, predict_tags = zip(*predict_lists)
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        return predict_lists, sum_loss, predicted_results

    if args['model_filename']:
        model_filename = args['model_filename']
        serializers.load_hdf5(model_filename, net)

    if is_test:
        # predict
        # model_filename = args['model_filename']
        # model_filename = save_dir + model_filename
        # serializers.load_hdf5(model_filename, net)
        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train

        if dev_file:
            predict_dev, loss_dev, predict_dev_tags = eval_loop(x_dev, x_char_dev, y_dev)
            gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
            result, phrase_info = util.conll_eval(
                gold_predict_pairs, flag=False, tag_class=tag_names)
            all_result = result['All_Result']
            print('all_result:', all_result)

        predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    prev_dev_accuracy = 0.0
    prev_dev_f = 0.0
    for epoch in range(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(x_ad[add_i]) for add_i in perm[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev, predict_dev_tags = eval_loop(
            x_dev, x_char_dev, y_dev, x_dev_additionals)

        gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
        result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False, tag_class=tag_names)
        all_result = result['All_Result']

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))
        logging.info('  f_measure :' + str(all_result[-1]))

        dev_f = all_result[-1]

        if prev_dev_f < dev_f:
            logging.info(' [update best model on dev set!]')
            dev_list = [prev_dev_f, dev_f]
            dev_str = '       ' + ' => '.join(map(str, dev_list))
            logging.info(dev_str)
            prev_dev_f = dev_f

            # Save model
            model_filename = save_name + '_epoch' + str(epoch)
            serializers.save_hdf5(model_filename + '.model', net)
            serializers.save_hdf5(model_filename + '.state', opt)
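The column handling above implies that `util.read_conll_file` returns a list of sentences, each a list of per-token rows in which the column named by `--input_idx` holds the word, further columns hold additional features, and the last column holds the gold tag. A tiny hand-built illustration of that assumed layout (not output of the real reader):

# One two-token sentence in the assumed row layout: word, extra feature, tag.
sentences_example = [
    [["John", "NNP", "B-PER"],
     ["runs", "VBZ", "O"]],
]
words = [row[0] for row in sentences_example[0]]   # word_input_idx == 0
tags = [row[-1] for row in sentences_example[0]]   # gold tag in the last column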
Example 6
    linkmanual = np.load(datadir + '/linkmanual.npy')

####### build model
if modelname == "SA":
    model = model.SA("SA")
elif modelname == "MA":
    model = model.MA("MA")
elif modelname == "KA":
    model = model.KA("KA")
elif modelname == "KA+D":
    model = model.KA_D("KA+D")
else:
    raise ValueError("No such model!")

sess = tf.Session()
w2v = util.build_vocab(w2vfile, model.word_size)
sess.run(model.initializer)

if args.load_model:
    model.saver.restore(sess, args.load_model)
elif not training:
    raise ValueError("Must load a model for testing!")

####### direct
if direct:
    util.printlog("Begin computing direct outputs")
    util.direct(w2v, sess, model, direct_entity, direct_context, embedding)

####### train
elif training:
    util.printlog("Begin training")
Example 7
    def train(self, train_data=None, options={}):
        super(knet, self).train(train_data, options)
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        util.printlog("Loading Data")
        embedding = np.load(self.embedding)
        train_entity = np.load(self.train_entity)
        train_context = np.load(self.train_context)
        train_label = np.load(self.train_labels)
        train_fbid = np.load(self.train_fbid)

        valid_entity = np.load(self.valid_entity)
        valid_context = np.load(self.valid_context)
        valid_label = np.load(self.valid_labels)
        valid_fbid = np.load(self.valid_fbid)

        train_size = len(train_entity)
        if train_size < 500:
            batch_size = train_size
            iter_num = train_size
            check_freq = train_size
        elif train_size < 10000:
            batch_size = train_size / 100
            iter_num = train_size / 10
            check_freq = train_size / 100
        else:
            batch_size = train_size / 1000
            iter_num = train_size / 100
            check_freq = train_size / 1000

        batch_size = int(batch_size)
        iter_num = int(iter_num)
        check_freq = int(check_freq)

        model = models.KA_D("KA+D", self.disamb_file)

        sess = tf.Session()
        w2v = util.build_vocab(self.glove, model.word_size)
        sess.run(model.initializer)

        util.printlog("Begin training")

        for i in range(iter_num):
            if i % check_freq == 0:
                util.printlog("Validating after running " +
                              str(int(i * batch_size / train_size)) +
                              " epoches")
                util.test(w2v, model, valid_entity, valid_context, valid_label,
                          valid_fbid, embedding, batch_size, sess, "all")
                model.saver.save(sess, os.path.join(self.model_dir, str(i)))

            fd = model.fdict(w2v, (i * batch_size) % train_size, batch_size, 1,
                             train_entity, train_context, train_label,
                             train_fbid, embedding, False)
            fd[model.kprob] = 0.5
            sess.run(model.train, feed_dict=fd)

            if batch_size != train_size and i % int(
                    train_size / batch_size / 10) == 0:
                util.printlog("Epoch {}, Batch {}".format(
                    int((i * batch_size) / train_size),
                    int((i * batch_size) % train_size / batch_size)))
        model.saver.save(sess, self.model_name)
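The batch-size heuristic at the start of `train` scales batch size, iteration count, and checkpoint frequency with the size of the training set. Pulled out as a standalone helper, the same logic reads as follows (a sketch mirroring the code above, not part of the original class):

def training_schedule(train_size):
    # Small corpora use the whole set per batch; larger ones use ~1% or ~0.1%.
    if train_size < 500:
        batch_size = iter_num = check_freq = train_size
    elif train_size < 10000:
        batch_size, iter_num, check_freq = train_size // 100, train_size // 10, train_size // 100
    else:
        batch_size, iter_num, check_freq = train_size // 1000, train_size // 100, train_size // 1000
    return batch_size, iter_num, check_freq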
Example 8
def main():
    '''
    Main function that coordinates the entire process. Parses arguments that specify the exercise and the
    experiment that should be run. Initializes the model and the checkpoint managers.
    '''

    parser = argparse.ArgumentParser(
        description='Define configuration of experiments')
    parser.add_argument('--mode',
                        type=str,
                        nargs='+',
                        choices=['train', 'evaluate', 'generate'],
                        required=True)
    parser.add_argument('--experiment',
                        type=str,
                        choices=['a', 'b', 'c'],
                        required=True)
    parser.add_argument('--id', type=str, required=False)
    parser.add_argument('--epochs', type=int, default=EPOCHS, required=False)

    args = parser.parse_args()

    # Setting Experiment Id
    if args.id is None:
        exp_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        print(f"No Experiment Id Set, Creating New: {exp_id}")
    else:
        exp_id = args.id
        print(f"Using Experiment Id: {exp_id}")

    # Setting Directories
    base_dir = f"{OUTPUT_DIR}/exp_{args.experiment}/{exp_id}"
    log_dir = f"{base_dir}/logs"
    submission_dir = f"{base_dir}/submissions"
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
    ckpt_dir = f"{base_dir}/ckpts"

    print(f"Experiment Directory: {base_dir}")

    print(f"Using Tensorflow Version: {tf.__version__}")
    print("Building Vocabulary...")
    build_vocab(input_file=PATH_TRAIN,
                output_file=PATH_VOCAB,
                top_k=VOCAB_SIZE,
                special=SPECIAL)
    word2id, id2word = build_vocab_lookup(PATH_VOCAB, "<unk>")

    # Setting Experiment Specific Configurations
    if args.experiment == 'a':
        lstm_hidden_state_size = 512
        word_embeddings = None

    elif args.experiment == 'b':
        lstm_hidden_state_size = 512
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)

    elif args.experiment == 'c':
        lstm_hidden_state_size = 1024
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    else:
        raise ValueError(f"Unknown Experiment {args.experiment}")

    print(f'Initializing Model...')
    model = LanguageModel(vocab_size=VOCAB_SIZE,
                          sentence_length=SENTENCE_LENGTH,
                          embedding_size=EMBEDDING_SIZE,
                          hidden_state_size=lstm_hidden_state_size,
                          output_size=LSTM_OUTPUT_SIZE,
                          batch_size=BATCH_SIZE,
                          word_embeddings=word_embeddings,
                          index_to_word_table=id2word)

    print(f'Initializing Optimizer...')
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=optimizer,
                               net=model)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)

    if manager.latest_checkpoint:
        print(f"Restoring Model from {manager.latest_checkpoint}...")
        ckpt.restore(manager.latest_checkpoint)
        model_loaded = True
    else:
        print("Initializing Model from Scratch")
        model_loaded = False

    if "train" in args.mode:
        print(f"Starting Training...")
        train_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/train")
        with train_summary_writer.as_default():
            train(ckpt=ckpt,
                  manager=manager,
                  model=model,
                  optimizer=optimizer,
                  word2id=word2id,
                  id2word=id2word,
                  epochs=args.epochs)
        model_loaded = True

    if "evaluate" in args.mode:
        print(f"Starting Evaluation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to be evaluated'

        test_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/evaluate")
        with test_summary_writer.as_default():
            evaluate(
                model=model,
                word2id=word2id,
                id2word=id2word,
                step=optimizer.iterations,
                path_submission=
                f"{submission_dir}/group35.perplexity{args.experiment.upper()}"
            )

    if "generate" in args.mode:
        print(f"Starting Generation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to start generation'

        generate_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/generate")
        with generate_summary_writer.as_default():
            generate(word2id,
                     id2word,
                     model=model,
                     path_submission=f"{submission_dir}/group35.continuation")
Example 9
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print(args)
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check that save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following:\n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter,
                                               input_idx=0,
                                               output_idx=-1)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file,
                                             delimiter=delimiter,
                                             input_idx=0,
                                             output_idx=-1)
    if test_file:
        sentences_test = util.read_conll_file(test_file,
                                              delimiter=delimiter,
                                              input_idx=0,
                                              output_idx=-1)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown pos tags
    # TODO: compute unk words
    if is_train:
        sentences_words_train = [w_obj[0] for w_obj in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    def parse_to_word_ids(sentences):
        return util.parse_to_word_ids(sentences,
                                      xp=xp,
                                      vocab=vocab,
                                      UNK_IDX=UNK_IDX,
                                      idx=0)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences,
                                      xp=xp,
                                      vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX,
                                      idx=0)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences,
                                     xp=xp,
                                     vocab=vocab_tags,
                                     UNK_IDX=-1,
                                     idx=-1)

    # if is_train:
    x_train = parse_to_word_ids(sentences_train)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)

    # elif is_test:
    #     x_predict = parse_to_word_ids(sentences_predict)
    #     x_char_predict = parse_to_char_ids(sentences_predict)
    #     y_predict = parse_to_tag_ids(sentences_predict)

    x_dev = parse_to_word_ids(sentences_dev)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)

    x_test = parse_to_word_ids(sentences_test)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)

    cnt_train_unk = sum([xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)
    util.write_vocab(save_vocab, vocab)
    util.write_vocab(save_vocab_char, vocab_char)
    util.write_vocab(save_tags_vocab, vocab_tags)
    util.write_vocab(save_train_config, args)

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=None,
                         n_label=len(vocab_tags))

    if args['word_emb_file']:
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
        net.word_embed.W.data[word_ids] = word_vecs

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)
        return predict_lists, sum_loss

    if is_test:
        # predict
        model_filename = args['model_filename']
        model_filename = save_dir + model_filename
        serializers.load_hdf5(model_filename, net)

        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train
        predict_pairs, _ = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    for epoch in range(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev = eval_loop(x_dev, x_char_dev, y_dev)

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))

        # Save model
        model_filename = save_name + '_epoch' + str(epoch)
        serializers.save_hdf5(model_filename + '.model', net)
        serializers.save_hdf5(model_filename + '.state', opt)
Example 10
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print(args)
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check that save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following:\n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    efficient_gpu = False
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)
        efficient_gpu = args.get('efficient_gpu', False)

    def to_gpu(x):
        if args['gpu'] >= 0:
            return chainer.cuda.to_gpu(x)
        return x

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    input_idx = list(map(int, args['input_idx'].split(',')))
    output_idx = list(map(int, args['output_idx'].split(',')))
    word_input_idx = input_idx[0]  # NOTE: word_idx is first column!
    additional_input_idx = input_idx[1:]
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown pos tags
    # TODO: compute unk words
    vocab_adds = []
    if is_train:
        sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence]
                                 for sentence in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)

        # Additional setup
        for ad_feat_id in additional_input_idx:
            sentences_additional_train = [[
                feat_obj[ad_feat_id] for feat_obj in sentence
            ] for sentence in sentences_train]
            vocab_add = util.build_vocab(sentences_additional_train)
            vocab_adds.append(vocab_add)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    if args.get('word_emb_file', False):
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_emb_vocab_type = args.get('word_emb_vocab_type')

        def assert_word_emb_shape(shape1, shape2):
            err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})'''
            if shape1 != shape2:
                err_msg = err_msg.format(str(shape1), str(shape2))
                raise ValueError(err_msg)

        def assert_no_emb(word_vecs):
            err_msg = '''There is no-embeddings! Please check your file `--word_emb_file`'''
            if word_vecs.shape[0] == 0:
                raise ValueError(err_msg)

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(
                emb_file)
            vocab = vocab_glove
        elif word_emb_vocab_type == 'replace_only':
            word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
            assert_no_emb(word_vecs)

        elif word_emb_vocab_type == 'additional':
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(
                emb_file)
            additional_vecs = []
            for word, word_idx in sorted(vocab_glove.items(),
                                         key=lambda x: x[1]):
                if word not in vocab:
                    vocab[word] = len(vocab)
                    additional_vecs.append(word_vecs[word_idx])
            additional_vecs = np.array(additional_vecs, dtype=np.float32)

    if args.get('vocab_file', False):
        vocab_file = args['vocab_file']
        vocab = util.load_vocab(vocab_file)

    if args.get('vocab_char_file', False):
        vocab_char_file = args['vocab_char_file']
        vocab_char = util.load_vocab(vocab_char_file)

    vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items())
    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    tmp_xp = xp
    if efficient_gpu:
        tmp_xp = np  # use CPU (numpy)

    def parse_to_word_ids(sentences, word_input_idx, vocab):
        return util.parse_to_word_ids(sentences,
                                      xp=tmp_xp,
                                      vocab=vocab,
                                      UNK_IDX=UNK_IDX,
                                      idx=word_input_idx)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences,
                                      xp=tmp_xp,
                                      vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX,
                                      idx=word_input_idx)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences,
                                     xp=tmp_xp,
                                     vocab=vocab_tags,
                                     UNK_IDX=-1,
                                     idx=-1)

    x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)
    x_train_additionals = [
        parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)
    x_dev_additionals = [
        parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    y_dev_cpu = [[w[-1] for w in sentence] for sentence in sentences_dev]
    # tag_names = []
    tag_names = list(
        set([
            tag[2:] if len(tag) >= 2 else tag[0] for tag in vocab_tags.keys()
        ]))

    x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)
    x_test_additionals = [
        parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)

    init_emb = None

    if is_train:
        util.write_vocab(save_vocab, vocab)
        util.write_vocab(save_vocab_char, vocab_char)
        util.write_vocab(save_tags_vocab, vocab_tags)
        util.write_vocab(save_train_config, args)

    n_vocab_add = [len(_vadd) for _vadd in vocab_adds]

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=init_emb,
                         char_input_dim=args['n_char_emb'],
                         char_hidden_dim=args['n_char_hidden'],
                         n_label=len(vocab_tags),
                         n_add_feature_dim=args['n_add_feature_emb'],
                         n_add_feature=len(n_vocab_add),
                         n_vocab_add=n_vocab_add,
                         use_cudnn=args['use_cudnn'])
    my_cudnn(args['use_cudnn'])

    if args.get('word_emb_file', False):

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            net.word_embed.W.data = word_vecs[:]
        elif word_emb_vocab_type == 'replace_only':
            assert_no_emb(word_vecs)
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            net.word_embed.W.data[word_ids] = word_vecs[:]

        elif word_emb_vocab_type == 'additional':
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            v_size = additional_vecs.shape[0]
            net.word_embed.W.data[-v_size:] = additional_vecs[:]

    if args.get('return_model', False):
        return net

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[
                    to_gpu(_) for _ in x_ad[index:index + batchsize]
                ] for x_ad in x_train_additionals]

            output = net(x_data=x,
                         x_char_data=x_char,
                         x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)

        _, predict_tags = zip(*predict_lists)
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        return predict_lists, sum_loss, predicted_results

    if args['model_filename']:
        model_filename = args['model_filename']
        serializers.load_hdf5(model_filename, net)

    if is_test:
        # predict
        # model_filename = args['model_filename']
        # model_filename = save_dir + model_filename
        # serializers.load_hdf5(model_filename, net)
        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train

        if dev_file:
            predict_dev, loss_dev, predict_dev_tags = eval_loop(
                x_dev, x_char_dev, y_dev)
            gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
            result, phrase_info = util.conll_eval(gold_predict_pairs,
                                                  flag=False,
                                                  tag_class=tag_names)
            all_result = result['All_Result']
            print('all_result:', all_result)

        predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict,
                                           y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    prev_dev_accuracy = 0.0
    prev_dev_f = 0.0
    for epoch in range(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[
                    to_gpu(x_ad[add_i])
                    for add_i in perm[index:index + batchsize]
                ] for x_ad in x_train_additionals]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            output = net(x_data=x,
                         x_char_data=x_char,
                         x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev, predict_dev_tags = eval_loop(
            x_dev, x_char_dev, y_dev, x_dev_additionals)

        gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
        result, phrase_info = util.conll_eval(gold_predict_pairs,
                                              flag=False,
                                              tag_class=tag_names)
        all_result = result['All_Result']

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))
        logging.info('  f_measure :' + str(all_result[-1]))

        dev_f = all_result[-1]

        if prev_dev_f < dev_f:
            logging.info(' [update best model on dev set!]')
            dev_list = [prev_dev_f, dev_f]
            dev_str = '       ' + ' => '.join(map(str, dev_list))
            logging.info(dev_str)
            prev_dev_f = dev_f

            # Save model
            model_filename = save_name + '_epoch' + str(epoch)
            serializers.save_hdf5(model_filename + '.model', net)
            serializers.save_hdf5(model_filename + '.state', opt)
Example 11
# Load data
print("Loading data...")
labels, sentences = load_data_and_labels_from_csv_file(data_file)

params = {'max_chars_features': 500}

lines_chars_level_features = generate_char_level_features(
    sentences, params['max_chars_features'])
params['max_chars_features'] = max(
    [len(lines) for lines in lines_chars_level_features])

lines_chars_level_features = np.array(lines_chars_level_features)

# Build vocabulary
print("Build the vocabulary")
vocabulary = build_vocab(lines_chars_level_features, max_vocab_size=10000)
#print(vocabulary)

# Pad sentence
print("Padding sentences...")
x_text = pad_sentences(lines_chars_level_features,
                       max_sequence_length=params['max_chars_features'])

seq_len = len(x_text[0])
print("The sequence length is: ", seq_len)

# Represent each sentence as a sequence of character indices
x = text_to_sequence(x_text, vocabulary)

# Shuffle data
#np.random.seed(1) #same shuffling each time
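A minimal sketch of the shuffling step the final comment announces, permuting features and labels with the same indices so pairs stay aligned (this continuation is an assumption, not the original code):

# Shuffle inputs and labels together.
shuffle_indices = np.random.permutation(len(x))
x_shuffled = np.array(x)[shuffle_indices]
labels_shuffled = np.array(labels)[shuffle_indices]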