Code Example #1
def init_emb_weights(shape, id_to_char):
    #emb_shape = (len(id_to_char), 100)
    initializer = Init.xavier_initializer()
    emb_weights = Kb.variable(initializer(shape),
                              dtype=None,
                              name="char_embedding")
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        emb_weights = sess.run(emb_weights.read_value())
        emb_weights = load_word2vec(FLAGS.emb_file, id_to_char, FLAGS.char_dim,
                                    emb_weights)
    return emb_weights
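
The load_word2vec helper itself is not shown on this page. As a rough guide to what the four-argument variant used here (and in examples #3, #5, #6 and #9 below) typically does, here is a minimal sketch: it assumes a plain-text embedding file with one "word v1 ... vN" entry per line and overwrites the rows of a pre-initialized matrix for every in-vocabulary word. This is an illustration, not the repositories' exact code.

import numpy as np

def load_word2vec(emb_path, id_to_word, word_dim, old_weights):
    """Fill rows of a randomly initialized matrix with pre-trained vectors."""
    new_weights = old_weights  # out-of-vocabulary words keep their random initialization
    pretrained = {}
    with open(emb_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) == word_dim + 1:  # skip header or malformed lines
                pretrained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    found = 0
    for idx, word in id_to_word.items():
        if word in pretrained:
            new_weights[idx] = pretrained[word]
            found += 1
    print("Loaded {} / {} pre-trained vectors from {}".format(found, len(id_to_word), emb_path))
    return new_weights
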
Code Example #2
 def __init__(self, params):
     super(SequenceToSequence, self).__init__()
     self.embedding_matrix = load_word2vec(params)
     self.params = params
     self.encoder = rnn_encoder.Encoder(params["vocab_size"],
                                        params["embed_size"],
                                        params["enc_units"],
                                        params["batch_size"],
                                        self.embedding_matrix)
     self.attention = rnn_decoder.BahdanauAttention(params["attn_units"])
     self.decoder = rnn_decoder.Decoder(params["vocab_size"],
                                        params["embed_size"],
                                        params["dec_units"],
                                        params["batch_size"],
                                        self.embedding_matrix)
Code Example #3
def eval_model(id_to_char, id_to_tag, test_manager, device, model_name=None):
    print("Eval ......")
    if not model_name:
        model_name = args.log_name
    old_weights = np.random.rand(len(id_to_char), args.word_embed_dim)
    pre_word_embed = load_word2vec("100.utf8", id_to_char, args.word_embed_dim,
                                   old_weights)
    e_model = Model(args, id_to_tag, device, pre_word_embed).to(device)
    e_model.load_state_dict(torch.load("./models/" + model_name + ".pkl"))
    print("model loaded ...")

    e_model.eval()
    all_results = []
    for batch in test_manager.iter_batch():

        strs, lens, chars, segs, subtypes, tags, adj, dep = batch
        chars = torch.LongTensor(chars).to(device)
        _lens = torch.LongTensor(lens).to(device)
        subtypes = torch.LongTensor(subtypes).to(device)
        tags = torch.LongTensor(tags).to(device)
        adj = torch.FloatTensor(adj).to(device)
        dep = torch.LongTensor(dep).to(device)
        logits, _ = e_model(chars, _lens, subtypes, adj, dep)
        """ Evaluate """
        # Decode
        batch_paths = []
        for index in range(len(logits)):
            length = lens[index]
            score = logits[index][:length]  # [seq, dim]
            probs = F.softmax(score, dim=-1)  # [seq, dim]
            path = torch.argmax(probs, dim=-1)  # [seq]
            batch_paths.append(path)

        for i in range(len(strs)):
            result = []
            string = strs[i][:lens[i]]
            gold = iobes_iob([id_to_tag[int(x)] for x in tags[i][:lens[i]]])
            pred = iobes_iob(
                [id_to_tag[int(x)] for x in batch_paths[i][:lens[i]]])
            for char, gold_tag, pred_tag in zip(string, gold, pred):
                result.append(" ".join([char, gold_tag, pred_tag]))
            all_results.append(result)

    all_eval_lines = test_ner(all_results, args.result_path, args.log_name)
    res_info = all_eval_lines[1].strip()
    f1 = float(res_info.split()[-1])
    print("eval: f1: {}".format(f1))
    return f1, res_info
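
iobes_iob converts the model's IOBES predictions back to the IOB scheme expected by the conlleval-style scorer behind test_ner. Its implementation is not shown above; a minimal sketch of the standard helper, included here only for reference:

def iobes_iob(tags):
    """Map IOBES tags to IOB2: S-X becomes B-X, E-X becomes I-X, the rest pass through."""
    new_tags = []
    for tag in tags:
        if tag.startswith("S-"):
            new_tags.append("B-" + tag[2:])
        elif tag.startswith("E-"):
            new_tags.append("I-" + tag[2:])
        else:  # O, B-*, I-*
            new_tags.append(tag)
    return new_tags
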
Code Example #4
File: model.py Project: rollben/chinese-nlp-ner
    def get_embedding(self, inputs, id_to_word):
        # embedding layer for input projection
        with tf.variable_scope("Embedding"), tf.device('/cpu:0'):
            if not self.params.pre_emb:
                embedding = tf.get_variable(
                    "word_emb", [self.num_words, self.params.word_dim],
                    initializer=init_ops.uniform_unit_scaling_initializer())
            else:
                print("load word2vec")
                embedding = tf.get_variable(
                    "word_emb",
                    dtype=tf.float32,
                    initializer=np.asarray(load_word2vec(
                        self.params.pre_emb, id_to_word),
                                           dtype=np.float32))

        x = tf.nn.embedding_lookup(embedding, inputs)
        return x
Code Example #5
def train(id_to_char, id_to_tag, train_manager, dev_manager, device):
    old_weights = np.random.rand(len(id_to_char), args.word_embed_dim)
    pre_word_embed = load_word2vec("100.utf8", id_to_char, args.word_embed_dim,
                                   old_weights)

    if args.label_weights:
        label_weights = torch.ones([len(id_to_tag)]) * args.label_weights
        label_weights[0] = 1.0  # none
        label_weights = label_weights.to(device)
    else:
        label_weights = None

    model = Model(args, id_to_tag, device, pre_word_embed).to(device)
    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=args.weight_decay)
    print("device: ", model.device)
    MAX_F1 = 0

    for epoch in range(args.epoch):

        log_handler.info("Epoch: {} / {} :".format(epoch + 1, args.epoch))
        log_handler.info("epoch {}, lr: {} ".format(
            epoch + 1, get_learning_rate(optimizer)))

        loss = train_epoch(model, optimizer, train_manager, label_weights,
                           device)
        log_handler.info("epoch {}, loss : {}".format(epoch + 1, loss))
        f1, dev_model = dev_epoch(epoch, model, dev_manager, id_to_tag, device)
        log_handler.info("epoch {}, f1 : {}".format(epoch + 1, f1))
        if f1 > MAX_F1:
            MAX_F1 = f1
            torch.save(dev_model.state_dict(),
                       "./models/{}.pkl".format(args.log_name))
        log_handler.info("epoch {}, MAX_F1: {}\n".format(epoch + 1, MAX_F1))
        print()
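
get_learning_rate is not defined in this snippet either; a plausible one-line helper for a PyTorch optimizer (an assumption, not necessarily the project's own code):

def get_learning_rate(optimizer):
    # every torch.optim optimizer exposes its current hyperparameters via param_groups
    return [group["lr"] for group in optimizer.param_groups]
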
Code Example #6
    def interface(self, msg):
        ckpt = tf.train.get_checkpoint_state(self.ckpt_path)
        # model = Model(load_config(self.config_file))
        logger = get_logger(self.log_file)

        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        with open(self.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

        self.model.saver = tf.train.import_meta_graph(basedir +
                                                      '/ckpt/ner.ckpt.meta')
        sess = tf.Session(config=tf_config)
        # with tf.Session(config=tf_config) as sess:
        with sess.as_default():
            # sess.run(tf.global_variables_initializer())
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                # self.model = tf.saved_model.loader.load(sess, ckpt.model_checkpoint_path)
                # self.model.saver = tf.train.import_meta_graph(basedir + '/ckpt/ner.ckpt.meta')
                self.model.saver.restore(sess, ckpt.model_checkpoint_path)
                # self.model.saver.restore(sess, tf.train.latest_checkpoint(basedir + '/ckpt/'))
            else:
                logger.info("Created model with fresh parameters.")
                sess.run(tf.global_variables_initializer())
                if self.config_file["pre_emb"]:
                    emb_weights = sess.run(self.model.char_lookup.read_value())
                    emb_weights = load_word2vec(self.config_file["emb_file"],
                                                id_to_char,
                                                self.config_file["char_dim"],
                                                emb_weights)
                    sess.run(self.model.char_lookup.assign(emb_weights))
                    logger.info("Load pre-trained embedding.")
            result = None  # avoid an unbound name when msg is empty
            if msg:
                result = self.model.evaluate_line(
                    sess, input_from_line(msg, char_to_id), id_to_tag)

            return result
Code Example #7
File: run.py Project: zhwei1688/NLP-project
    x = import_module('models.' +
                      model_name)  # dynamically import the configuration module for the selected model
    config = x.Config(dataset)  # the model's Config.__init__ initializes its hyperparameters
    start_time = time.time()
    print("Loading data...")

    train_data, dev_data, test_data, train_sentences, test_sentences, dev_sentences, word_to_id, id_to_word, tag_to_id, id_to_tag = load_model_dataset(
        config)

    config.n_vocab = len(word_to_id)

    time_dif = data_utils.get_time_dif(start_time)
    print("Time usage:", time_dif)

    embedding_pretrained = data_utils.load_word2vec(config, id_to_word)

    train_X, train_Y = data_utils.get_X_and_Y_data(train_data, config.max_len,
                                                   len(tag_to_id))

    dev_X, dev_Y = data_utils.get_X_and_Y_data(dev_data, config.max_len,
                                               len(tag_to_id))

    test_X, test_Y = data_utils.get_X_and_Y_data(test_data, config.max_len,
                                                 len(tag_to_id))

    train_dataset = tf.data.Dataset.from_tensor_slices((train_X, train_Y))
    train_dataset = train_dataset.shuffle(len(train_X)).batch(
        config.batch_size, drop_remainder=True)

    # train
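
The excerpt stops just before the training code. One common way to plug the embedding_pretrained matrix loaded above into a tf.keras model is sketched below, assuming the matrix is a NumPy array of shape (config.n_vocab, embedding dim); the project's actual model code is not shown here.

embedding_layer = tf.keras.layers.Embedding(
    input_dim=config.n_vocab,
    output_dim=embedding_pretrained.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_pretrained),
    mask_zero=True)  # set trainable=False if the pre-trained vectors should stay fixed
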
Code Example #8
File: main.py Project: k141303/tacred_cnn
def main(args=None):
    if args is None:
        args = load_arg()

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    set_seed(args)

    print_args(args)

    if args.glove is not None:
        embedding_vectors, word2id = load_word2vec(args.glove,
                                                   vocab=args.vocab,
                                                   use_gensim=False)
    else:
        embedding_vectors, word2id = load_word2vec(args.word2vec,
                                                   vocab=args.vocab,
                                                   use_gensim=True)
    train_dataset, dev_dataset, test_dataset = load_tacred_dataset(
        args, word2id)

    if args.entity_mask:
        mask_vectors = torch.randn(
            len(train_dataset.ner_tags) * 2, embedding_vectors.size(1))
        embedding_vectors = torch.cat([embedding_vectors, mask_vectors], dim=0)

    label_weights = train_dataset.label_weights if args.label_weights else None
    model = CNNForRE(args,
                     embedding_vectors,
                     pad_id=train_dataset.pad_id,
                     num_labels=train_dataset.num_labels,
                     label_weights=label_weights)

    do_train = True
    if os.path.exists(f"{args.output}/pytorch_model.bin"):
        model.load_state_dict(
            torch.load(f"{args.output}/pytorch_model.bin", map_location="cpu"))
        do_train = False

    model.to(args.device)

    preds, scores = {}, {}
    model, scores["train"], scores["dev"], preds["train"], preds[
        "dev"], best_epoch = train(args, train_dataset, dev_dataset, model,
                                   do_train)

    test_score = None
    if args.do_eval:
        scores["test"], preds["test"] = eval(args, test_dataset, model)
        print(
            f"|{'TEST':<7}|{scores['test']['precision']:>6.2f}|{scores['test']['recall']:>6.2f}|{scores['test']['f1']:>6.2f}|"
        )

    model.to("cpu")

    if args.output is not None:
        os.makedirs(f"{args.output}/predictions", exist_ok=True)
        save_model(args.output, model)
        save_args(args.output, args)
        save_preds(f"{args.output}/predictions", preds)
        save_json(f"{args.output}/scores.json", scores)

    return model, scores, best_epoch
Code Example #9
def do_train(config):
    train, dev, test = load_data(config)  # load the datasets
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)  # create or load the maps

    # record the configuration and save it
    config["num_chars"] = len(word_to_id)  # vocabulary size
    config["num_tags"] = len(tag_to_id)  # number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # prepare the data
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])

    print("train/dev/test 句子数:{} / {} / {}".format(len(train_data), len(dev_data), len(test_data)))

    # split into batches
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    steps_per_epoch = train_manager.len_data  # steps per epoch

    # create the required paths
    make_path(config)

    # logger
    logger = get_logger(config["log_file"])

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # create the model; an existing parameter configuration can be reused
        model = Model(config)

        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])  # look up a checkpoint under the model path
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):  # an existing model
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Creating a new model...")
            sess.run(tf.global_variables_initializer())  # embeddings are not pre-trained here

            # if pre-trained embeddings are used
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word, config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")

        logger.info("开始训练...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))

                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
Code Example #10
# build indexes
if not os.path.isfile(indexes_pkl) or not os.path.isfile(indexes_size_pkl):
    log.info("build indexes")
    indexes = {}
    indexes_size = {}
else:
    log.info("previous indexes ({})".format(indexes_pkl))
    indexes = load_from_pkl(indexes_pkl)
    indexes_size = load_from_pkl(indexes_size_pkl)
indexes['words2id'], indexes_size['words2id'] = build_index(train['words'])
indexes['rel_senses2id'], indexes_size['rel_senses2id'] = build_index(train['rel_senses'])
log.info("  " + ", ".join([ "{}: {}".format(k, v) for k, v in indexes_size.items() ]))
save_to_pkl(indexes_pkl, indexes)
save_to_pkl(indexes_size_pkl, indexes_size)

init_weights = load_word2vec(indexes['words2id'], indexes_size['words2id'], words_dim, words2vec_bin, words2vec_txt)

# build model
log.info("build model")
words2id_size = indexes_size['words2id']
rel_senses2id_size = indexes_size['rel_senses2id']

shared_emb = Embedding(input_dim=words2id_size, output_dim=words_dim, weights=init_weights, dropout=words_dropout, mask_zero=True, name="shared_emb")

# input: arg1 word/token ids
arg1_ids = Input(shape=(arg1_len,), dtype='int32', name="arg1_ids")
# shape: (sample, arg1_len) of words2id_size

# input: arg2 word/token ids
arg2_ids = Input(shape=(arg2_len,), dtype='int32', name="arg2_ids")
# shape: (sample, arg2_len) of words2id_size
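
A plausible continuation of this Keras snippet: both argument inputs go through the same shared_emb layer, so arg1 and arg2 share the word vectors initialized from load_word2vec (the original model definition continues beyond this excerpt).

arg1_emb = shared_emb(arg1_ids)  # shape: (sample, arg1_len, words_dim)
arg2_emb = shared_emb(arg2_ids)  # shape: (sample, arg2_len, words_dim)
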
Code Example #11
File: main.py Project: ypycsy/CDTL-PSE
def train(X_train,X_dev,X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test

    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc])
                )
            )
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per])
                )
            )
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag,char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data),len(dev_data), len(test_data)))
    train_data_loc = prepare_dataset_ner(
        train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    dev_data_loc = prepare_dataset_ner(
        dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    test_data_loc = prepare_dataset_ner(
        test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))
    train_data_per = prepare_dataset_ner(
        train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    dev_data_per = prepare_dataset_ner(
        dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    test_data_per = prepare_dataset_ner(
        test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))
    train_data_org = prepare_dataset_ner(
        train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    dev_data_org = prepare_dataset_ner(
        dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    test_data_org = prepare_dataset_ner(
        test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id,char_to_id_loc, tag_to_id_loc,char_to_id_per, tag_to_id_per,char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger)

    with tf.Session(config=tf_config, graph = model.graph ) as sess:

        sess.run(tf.global_variables_initializer())
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(config["emb_file"], id_to_char, id_to_char_loc,id_to_char_per,id_to_char_org, config["char_dim"],
                                                    emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                    step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                    loss_loc.append(batch_loss_loc)
                    if step_loc % FLAGS.steps_check == 0:
                        iteration_loc = step_loc // steps_per_epoch_loc + 1
                        logger.info("iteration:{} step_loc:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_loc, step_loc % steps_per_epoch_loc, steps_per_epoch_loc, np.mean(loss_loc)))
                        loss_loc = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_1, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                    step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                    loss_per.append(batch_loss_per)
                    if step_per % FLAGS.steps_check == 0:
                        iteration_per = step_per // steps_per_epoch_per + 1
                        logger.info("iteration:{} step_per:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_per, step_per % steps_per_epoch_per, steps_per_epoch_per, np.mean(loss_per)))
                        loss_per = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_2, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                    step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                    loss_org.append(batch_loss_org)
                    if step_org % FLAGS.steps_check == 0:
                        iteration_org = step_org // steps_per_epoch_org + 1
                        logger.info("iteration:{} step_org:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_org, step_org % steps_per_epoch_org, steps_per_epoch_org, np.mean(loss_org)))
                        loss_org = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_3, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,precision_loc_dev,precision_per_dev,precision_org_dev, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_test,results= evaluate(sess, model, "test", test_manager, id_to_tag,precision_loc_test,precision_per_test,precision_org_test, logger)
                with open("CDTL_PSE-result.csv", "a",encoding='utf-8')as st_re:
                    st_re.write(str(results).replace("[", "").replace("]", ""))
                    st_re.write("\n")
Code Example #12
File: main.py Project: fudannlp16/NeuralChineseNER
def train():
    # load data sets
    train_sentences=load_sentences(FLAGS.train_file,FLAGS.zeros)
    dev_sentences=load_sentences(FLAGS.dev_file,FLAGS.zeros)
    test_sentences=load_sentences(FLAGS.test_file,FLAGS.zeros)

    # appoint tagging scheme (IOB/IOBES)
    train_sentences=update_tag_scheme(train_sentences,FLAGS.tag_schema)
    dev_sentences=update_tag_scheme(dev_sentences,FLAGS.tag_schema)
    test_sentences=update_tag_scheme(test_sentences,FLAGS.tag_schema)

    #create maps if not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id,_=char_mapping(train_sentences)
            char_to_id,id_to_char=augment_with_pretrained(char_to_id,'wiki_100.utf8')
        else:
            char_to_id, id_to_char=char_mapping(train_sentences)
        tag_to_id, id_to_tag=tag_mapping(train_sentences)
        with open(FLAGS.map_file,'wb') as f:
            cPickle.dump([char_to_id,id_to_char,tag_to_id,id_to_tag],f,cPickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file,'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag=cPickle.load(f)

    # prepare data, get a collection of list containing index
    train_data=prepare_dataset(train_sentences,char_to_id,tag_to_id,True)
    dev_data=prepare_dataset(dev_sentences,char_to_id,tag_to_id,True)
    test_data=prepare_dataset(test_sentences,char_to_id,tag_to_id,True)
    print "%i %i %i sentences in train / dev / test." % (len(train_data),len(dev_data),len(test_data))

    if not FLAGS.pre_emb:
        pre_emb=None
    else:
        pre_emb=load_word2vec(FLAGS.pre_emb_file,char_to_id,FLAGS.char_dim)
        print "init embedding shape: (%d,%d)" %(pre_emb.shape[0],pre_emb.shape[1])

    train_manager=BatchManager(train_data,FLAGS.batch_size,True)
    dev_manager=BatchManager(dev_data,FLAGS.batch_size,False)
    test_manager=BatchManager(test_data,FLAGS.batch_size,False)

    config=BasicModelConfig(FLAGS,len(char_to_id),len(tag_to_id),4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print "Train started!"
        model=BasicModel(config,pre_emb)
        saver=tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            os.mkdir(FLAGS.summaries_dir)
        merged=tf.summary.merge_all()
        train_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"train"),sess.graph)
        test_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"test"),sess.graph)

        # load previous trained model or create a new model
        if not os.path.exists(FLAGS.checkpoints):
            os.mkdir(FLAGS.checkpoints)
        model_name=os.path.join(FLAGS.checkpoints,FLAGS.model_name)
        ckpt=tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print "restore from previous traied model: %s" % FLAGS.model_name
            saver.restore(sess,ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess,model,manager):
            strings=[]
            predicts=[]
            goldens=[]
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string,batch_predict,batch_golden=model.evaluate_step(sess,batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings,predicts,goldens

        best_eval_f1=0
        noimpro_num=0
        for i in range(FLAGS.max_epoch):
            #train
            train_loss=[]
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step,batch in bar(enumerate(train_manager.iter_batch())):
                batch.append(merged)
                summary,global_step,batch_loss=model.train_step(sess,batch,FLAGS.dropout_keep)
                #add summary to tensorboard
                train_writer.add_summary(summary,global_step)
                train_loss.append(batch_loss)
            print "Epoch %d Train loss is %.4f" % (i+1,np.mean(train_loss))

            #dev
            strings,predicts,goldens=evaluate(sess,model,dev_manager)
            eval_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/dev')
            if eval_f1>best_eval_f1:
                best_eval_f1=eval_f1
                noimpro_num=0
                saver.save(sess,model_name)
            else:
                noimpro_num+=1
            print "Epoch %d Best eval f1:%.6f" % (i+1,best_eval_f1)

            #test
            strings,predicts,goldens=evaluate(sess,model,test_manager)
            test_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/test',True)
            #early_stop
            if noimpro_num>=3:
                print "Early stop! Final F1 scores on test data is :%.6f" % test_f1
                break
            print
Code Example #13
    def __init__(self):
        # load word embedding
        glove = data_utils.load_glove(FLAGS.glove_file)
        word2vec = data_utils.load_word2vec(FLAGS.word2vec_file)
        merged_embed, self.vocab_size = self.merge_glove_word2vec(
            glove, word2vec)
        dim = len(merged_embed[0])
        merged_embed.append([0. for _ in xrange(dim)])

        # load doc embedding
        self.doc_embedding, doc_dim = data_utils.load_fastText_embed(\
          FLAGS.fastText_doc_file, FLAGS.fastText_vector_file)
        self.zero_doc_key = self.doc_key([self.vocab_size], [self.vocab_size])
        self.doc_embedding[self.zero_doc_key] = [0. for _ in xrange(doc_dim)]

        FLAGS.fc_units = map(int, FLAGS.fc_units.split(','))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        ''' graph '''
        print 'Initializing model graph...'
        with tf.variable_scope('inputs'):
            self.training = tf.placeholder(tf.bool, name='training')

            self.title = tf.placeholder(
                tf.int32, shape=[None, None],
                name='title')  # [batch size, sequence length]
            self.content = tf.placeholder(tf.int32,
                                          shape=[None, None],
                                          name='content')
            self.title_length = tf.placeholder(tf.int32, shape=[None], \
              name='title_length')
            self.content_length = tf.placeholder(tf.int32, shape=[None],\
              name='content_length')

            self.prices = tf.placeholder(tf.float32, name='prices', \
              shape=[None, None, 7])
            self.price_length = tf.placeholder(tf.int32, shape=[None], \
              name='price_length')

            self.docs = tf.placeholder(tf.float32, name='docs', \
              shape=[None, None, doc_dim])
            self.doc_length = tf.placeholder(tf.int32, shape=[None], \
              name='doc_length')

            self.label = tf.placeholder(tf.int32,
                                        shape=[None, 2],
                                        name='label')

        with tf.variable_scope('birnn_embed'):
            self.word_embedding = tf.Variable(merged_embed,
                                              dtype=tf.float32,
                                              name='word_embedding_matrix')
            title_embed = self.embed_birnn(FLAGS.title_units,
                                           FLAGS.title_layers,
                                           self.title,
                                           self.title_length,
                                           scope='title_embed_birnn')
            content_embed = self.embed_birnn(FLAGS.content_units,
                                             FLAGS.content_layers,
                                             self.content,
                                             self.content_length,
                                             scope='content_embed_birnn')
            price_embed = self.birnn(FLAGS.price_units,
                                     FLAGS.price_layers,
                                     self.prices,
                                     self.price_length,
                                     scope='price_birnn')
            doc_embed = self.birnn(FLAGS.doc_units,
                                   FLAGS.doc_layers,
                                   self.docs,
                                   self.doc_length,
                                   scope='doc_birnn')
            final_embed = tf.concat(
                [title_embed, content_embed, doc_embed, price_embed], 1)

        with tf.variable_scope('full_connect'):
            fc_inputs = final_embed
            for i in range(FLAGS.fc_layers):
                with tf.variable_scope('full_connect_layer_%d' % i):
                    fc_outputs = tf.contrib.layers.legacy_fully_connected(
                        fc_inputs,
                        FLAGS.fc_units[i],
                        activation_fn=tf.nn.relu,
                        weight_regularizer=tf.contrib.layers.l2_regularizer(
                            FLAGS.l2_coef))
                    fc_inputs = fc_outputs

        with tf.variable_scope('dropout'):
            dropout = tf.layers.dropout(fc_outputs, training=self.training)

        with tf.variable_scope('output'):
            W = tf.get_variable('W',
                                shape=[FLAGS.fc_units[-1], 2],
                                initializer=tf.truncated_normal_initializer())
            biases = tf.get_variable(
                'biases',
                shape=[2],
                initializer=tf.random_normal_initializer())
            logits = tf.matmul(dropout, W) + biases
            self.result = tf.nn.softmax(logits)

        with tf.variable_scope('train'):
            self.cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.label,
                                                        logits=logits))

            self.learning_rate = tf.Variable(FLAGS.init_lr,
                                             trainable=False,
                                             name="learning_rate")
            self.lr_decay_op = self.learning_rate.assign(self.learning_rate *
                                                         FLAGS.lr_decay)

            self.global_step = tf.Variable(0,
                                           trainable=False,
                                           name='global_step')
            self.train_op = tf.train.AdamOptimizer(FLAGS.init_lr) \
                .minimize(self.cross_entropy, self.global_step)

        with tf.variable_scope('logs'):
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
            self.log_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.train_dir, 'logs/'), self.session.graph)
            self.summary = tf.Summary()