def train():

    word_to_id, id_to_word = load_vocab(args.vocab_file)
    tag_to_id, id_to_tag = load_vocab(args.tag_file)
    processing_word = get_processing_word(word_to_id)
    processing_tag = get_processing_word(tag_to_id, allow_unk=False)

    # load data sets
    train_sentences = LoadDataset(args.train_file, processing_word,
                                  processing_tag)
    dev_sentences = LoadDataset(args.dev_file, processing_word, processing_tag)
    test_sentences = LoadDataset(args.test_file, processing_word,
                                 processing_tag)

    # Use selected tagging scheme (IOB / IOBES)
    # update_tag_scheme(train_sentences, args.tag_schema)
    # update_tag_scheme(test_sentences, args.tag_schema)

    if os.path.isfile(args.config_file):
        config = load_config(args.config_file)
    else:
        config = config_model(word_to_id, tag_to_id, id_to_tag)
        save_config(config, args.config_file)

    make_path(args)
    log_path = os.path.join("log", args.log_file)
    logger = get_logger(log_path)

    with tf.Session() as sess:

        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, logger)

        model.train(train_sentences, dev_sentences)
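Every example on this page relies on a get_logger helper that is not itself shown. Below is a minimal sketch of what such a helper typically looks like, assuming it only takes a log-file path; note that the FAQ example further down passes a name plus a logfile keyword, so the exact signature varies per project.

import logging


def get_logger(log_file):
    # Minimal logger factory: writes to log_file and echoes to the console.
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:  # avoid adding duplicate handlers on reuse
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
        logger.addHandler(file_handler)
        logger.addHandler(logging.StreamHandler())
    return logger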
def set_log_dir(self, new_dir):
    self.log_dir = new_dir
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    self.out_dir = os.path.abspath(
        os.path.join(self.log_dir, str(self.version)))
    if not os.path.exists(self.out_dir):
        os.makedirs(self.out_dir)
    self.saved_model_dir = os.path.abspath(
        os.path.join(self.out_dir, "checkpoints"))
    self.checkpoint_prefix = os.path.join(self.saved_model_dir, "model")
    self.train_summary_dir = os.path.join(self.out_dir, "summaries",
                                          "train")
    self.dev_summary_dir = os.path.join(self.out_dir, "summaries", "dev")
    self.logger = get_logger(os.path.join(self.out_dir, 'log'))
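A hypothetical host class for the method above, to show how it would be called. Only the version attribute is actually required by set_log_dir; the Trainer name and the "runs" directory are illustrative, and os plus a get_logger function must already be in scope.

class Trainer:
    # Hypothetical host class: set_log_dir only needs self.version.
    def __init__(self, version):
        self.version = version

    set_log_dir = set_log_dir  # attach the function above as a method


trainer = Trainer(version=1)
trainer.set_log_dir("runs")
# Creates runs/ and runs/1/, and computes (but does not create) the
# checkpoint and summary paths beneath runs/1/.
print(trainer.checkpoint_prefix)  # e.g. .../runs/1/checkpoints/model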
Example #3
def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        while True:
            # read a sentence from stdin and tag it; log any errors
            # instead of crashing the interactive loop
            try:
                line = input("Enter a test sentence: ")
                result = model.evaluate_line(
                    sess, input_from_line(line, char_to_id), id_to_tag)
                print(result)
            except Exception as e:
                logger.info(e)
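Several examples here round-trip the model configuration through load_config and save_config, which are also not shown. A plausible minimal version, assuming the config is a plain dict stored as JSON (the real helpers may differ):

import json


def load_config(config_file):
    # Read the JSON config file back into a dict.
    with open(config_file, encoding="utf8") as f:
        return json.load(f)


def save_config(config, config_file):
    # Persist the config dict as human-readable JSON.
    with open(config_file, "w", encoding="utf8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)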
Example #4
import argparse
import os

import torch
from torch.nn.parallel import DistributedDataParallel

from model.text import Vocab

import utils  # project helpers (load_config, get_logger, ...); exact module path not shown in this snippet

parser = argparse.ArgumentParser()
parser.add_argument('--config', help='config file', default='config.json')
parser.add_argument('--gpu', help='which gpu to use', type=str, default='4')
parser.add_argument("--local_rank",
                    help='used for distributed training',
                    type=int,
                    default=-1)

args = parser.parse_args()
config = utils.load_config(args.config)
config_path = os.path.dirname(args.config)
logger = utils.get_logger(os.path.join(config_path, 'main.log'))

train_dir = os.path.join(config_path, config['train_dir'])
data_dir = os.path.join(config_path, config['data_dir'])
eval_dir = os.path.join(config_path, config['eval_dir'])
log_dir = os.path.join(config_path, config['log_dir'])
best_model = os.path.join(config_path, config['best_dir'])

os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu


# helpers -----------------------------------------------------
def save_func(epoch, device):
    filename = utils.get_ckpt_filename('model', epoch)
    torch.save(trainer.state_dict(), os.path.join(train_dir, filename))
    if os.path.exists(
Example #5
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data  # the length of batch data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):  # train for (at most) 100 epochs
            for batch in train_manager.iter_batch(shuffle=True):
                # run one training step; step is the global step counter
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            # validate the model on the dev set
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            # evaluate the current best model on the test set
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
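BatchManager is not shown on this page either. From its usage above (constructed with the prepared data and a batch size, exposing len_data and iter_batch(shuffle=...)), a minimal sketch could look like the following; the real class likely also sorts sentences by length and pads each batch, which is omitted here.

import math
import random


class BatchManager:
    # Minimal sketch: split the prepared sentences into fixed-size batches.
    def __init__(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        self.batch_data = [
            data[i * batch_size:(i + 1) * batch_size]
            for i in range(num_batch)
        ]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch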
Example #6

import os
import logging
import jieba
import jieba.posseg as pseg
from collections import deque
import pathlib

from model.utils import (get_logger, similarity)

jieba.dt.tmp_dir = "./"
jieba.default_logger.setLevel(logging.ERROR)
logger = get_logger('faqrobot', logfile="faqrobot.log")

basedir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)

faqdata = basedir + '/data/FAQ.txt'


class zhishiku:
    # One knowledge-base entry ("zhishiku" = knowledge base): an answer
    # together with all question variants that map to it.
    def __init__(self, q):
        self.q = [q]      # question strings (variants of the same question)
        self.a = ""       # answer text
        self.sim = 0      # similarity score of the most recent match
        self.q_vec = []   # vector representations of the questions
        self.q_word = []  # segmented words of the questions

    def __str__(self):