def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False, allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, config.max_iter)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
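A minimal driver for this first example might look like the following sketch; it assumes the same Config class and imports that appear verbatim in Example #38 further down this listing.

# usage sketch (assumed imports, mirroring Example #38)
from config import Config

if __name__ == "__main__":
    config = Config()
    main(config)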
Example #2
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
Example #3
def doeval():
    parser = argparse.ArgumentParser(description='Text CNN classifier')
    # a trained model must be specified
    parser.add_argument('--model',
                        type=str,
                        default="model/textcnn.model",
                        help='model to load for evaluation')
    conf = Config()
    # print the model configuration
    conf.dump()
    args = parser.parse_args()
    print("Loading test data")
    # do not shuffle the data at evaluation time
    eval_iter, text_field, label_field = data_utils.text_dataloader(
        conf.eval_dir, conf.batch_size, shuffle=False)
    # load and initialize the model
    if os.path.exists(args.model):
        print('Found model file, loading model: {}'.format(args.model))
        cnn = torch.load(args.model)
    else:
        print("Model file not found, exiting")
        sys.exit(-1)
    # load the saved vocabularies
    text_field.vocab = data_utils.load_vocab("model/text.vocab")
    label_field.vocab = data_utils.load_vocab("model/label.vocab")
    # run the evaluation
    model_utils.eval(eval_iter, cnn, conf)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
Example #5
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    #test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)

    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

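    # Note: the POS vocab collected from the datasets above is replaced here by
    # the GloVe-derived feature vocab before UNK/NUM are added.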
    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)

    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)

    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    model = WImpModel(config,
                      embeddings,
                      ntags=config.nclass,
                      nchars=len(vocab_chars))

    # build WImpModel
    model.build_graph()

    # train, evaluate and interact
    model.train(train, dev)
    model.evaluate(test)
def chat(question):
    """
    In test mode, we don't create the backward path.
    """
    _, enc_vocab = data_utils.load_vocab(
        os.path.join(config.DATA_PATH, "vocab.enc"))
    # `inv_dec_vocab` <type "list">: id2word.
    inv_dec_vocab, _ = data_utils.load_vocab(
        os.path.join(config.DATA_PATH, "vocab.dec"))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_restore_parameters(sess, saver)
        output_file = open(os.path.join(config.DATA_PATH,
                                        config.TERMINAL_OUTPUT),
                           "a+",
                           encoding="utf-8")
        # Decode from standard input.
        max_length = config.BUCKETS[-1][0]
        print(
            "Welcome to TensorBro. Say something. Enter to exit. Max length is",
            max_length)

        line = question
        if hasattr(line, "decode"):
            # If using Python 2
            # FIXME: UnicodeError when deleting Chinese in terminal.
            line = line.decode("utf-8")
        if len(line) > 0 and line[-1] == "\n":
            line = line[:-1]
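        # No-op branch: unlike the interactive chat() later in this listing, empty input is not handled here.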
        if not line:
            pass
        output_file.write("HUMAN ++++ " + line + "\n")
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence2id(enc_vocab, line)
        if len(token_ids) > max_length:
            print("Max length I can handle is:", max_length)
            # line = _get_user_input()
            pass
        # Which bucket does it belong to?
        bucket_id = find_right_bucket(len(token_ids))
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, decoder_masks = data_utils.get_batch(
            [(token_ids, [])], bucket_id, batch_size=1)
        # Get output logits for the sentence.
        _, _, output_logits = run_step(sess, model, encoder_inputs,
                                       decoder_inputs, decoder_masks,
                                       bucket_id, True)
        response = construct_response(output_logits, inv_dec_vocab)
        print(response)
        output_file.write("BOT ++++ " + response + "\n")

        output_file.write("=============================================\n")
        output_file.close()
Example #8
    def __init__(self, config):
        self.config = config
        self.vocab_words = load_vocab(self.config.filename_words)
        self.vocab_tags = load_vocab(self.config.filename_tags)
        self.vocab_chars = load_vocab(self.config.filename_chars)

        # Get pre-trained embeddings
        self.w_embeddings = (get_trimmed_glove_vectors(config.filename_trimmed)
                             if self.config.use_pretrained else None)
Example #9
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _  = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)


    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)

    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)

    processing_pos = get_processing_word(vocab_pos,
                                         lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)


    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)

    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)
    
    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    if state == "train":
        model.train(train, dev, vocab_tags)

    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)

    else:  # state == predict
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)

        model.evaluate(test, vocab_tags)

        tagging("data_format/test_convert.txt")
Example #10
class nlu():

    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)

    # get processing functions

    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # get logger
    # logger = get_logger(config.log_path)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()

    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}
    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)
    # model.logger.info("This is an interactive mode, enter a sentence:")

    @staticmethod
    def rec(sentence):
        try:

            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            # print character_separation(sentence)[0]

            words_raw = character_separation(sentence)[0].split(' ')
            # for word in words_raw:
            #     if type(word)==str:
            words_raw = [unicode(word, 'utf-8') for word in words_raw]
            # words_raw = [word.decode('utf-8') for word in words_raw]
            # else:
            # words_raw = [unicode(word, 'utf-8') for word in words_raw]

            words = map(processing_word, words_raw)
            words = list(words)
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = map(lambda idx: nlu.idx_to_tag[idx], list(pred_ids[0]))
            # print(list(preds))
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return list(preds)
        except EOFError:
            print("Closing session.")


# nlu.rec('请播放电视剧三生三世十里桃花')
Example #11
def build_data(config):
    """
    Procedure to build data
    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
Example #12
    def __init__(self, args):
        super().__init__()

        train_file = args.train_file
        vocab_file = args.vocab_file

        train_sens = data_utils.load_sentences(train_file, skip_invalid=True)
        word2id, id2word, label2id, id2label = data_utils.load_vocab(
            train_sens, vocab_file)

        data_utils.gen_ids(train_sens, word2id, label2id, 100)
        train_full_tensors = data_utils.make_full_tensors(train_sens)

        raw_x = train_full_tensors[0]
        x_length = train_full_tensors[1]
        x_labels = train_full_tensors[2]

        raw_f = lambda t: id2label[t]
        x_labels_true = np.array(list(map(raw_f, x_labels)))

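        # Note: with the 1.0 ratio below, every sample goes to the train split and the test split stays empty.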
        n_train = int(len(raw_x) * 1)
        self.train_x, self.test_x = raw_x[:n_train], raw_x[n_train:]
        self.train_length_x, self.test_length_x = x_length[:n_train], x_length[
            n_train:]
        self.train_y, self.test_y = x_labels[:n_train], x_labels[n_train:]
        self.gt_label = x_labels_true
        self.raw_q = ["".join(i.raw_tokens) for i in train_sens]
Example #13
def train():
    checkpoint = "../model/checkpoint/model.ckpt"

    data_utils.prepare()
    index_to_char, char_to_index, vocab_size = data_utils.load_vocab()
    epochs = 100
    with tf.Session() as sess:
        model = attention_seq2seq(vocab_size)
        model.build_model()
        sess.run(tf.global_variables_initializer())

        train_summary = tf.summary.FileWriter('../model/summary/',
                                              graph=sess.graph)
        for epoch in range(1, epochs + 1):
            train_set = data_utils.train_set(char_to_index)
            for source_seq, target_seq in train_set:
                encoder_inputs, encoder_inputs_length, decoder_inputs, decoder_inputs_length = data_utils.prepare_train_batch(
                    source_seq, target_seq)
                _, loss = model.train(
                    sess=sess,
                    encoder_inputs=encoder_inputs,
                    encoder_inputs_length=encoder_inputs_length,
                    decoder_targets=decoder_inputs,
                    decoder_inputs_length=decoder_inputs_length)
                print("epoch={}, loss={}".format(epoch, loss))

            model.merge(sess, train_summary, epoch)

        saver = tf.train.Saver()
        saver.save(sess, save_path=checkpoint)
        print('Model Trained and Saved')
Example #14
    def load(self):
        self.vocab_tags = load_vocab(self.filename_tags)
        self.processing_tag = get_processing_word(self.vocab_tags,
                                                  lowercase=False,
                                                  allow_unk=False)
        self.ntags = len(self.vocab_tags)
        self.early_stop_metric_sign = -1 if self.stop_direction == 'increase' else 1
Example #15
def get_optimal_set(K, vocab_file, corpus_dir, polar_seed_file):
    def get_set(polar_seed, K):
        return polar_seed[:K], polar_seed[-K:]

    if os.path.exists(polar_seed_file):
        polar_seed = []
        for line in open(polar_seed_file):
            ps = line.split('\t')
            polar_seed.append(int(ps[1]))
        return get_set(polar_seed, K)

    vocab2idx, vocab_str, vocab_count = data_utils.load_vocab(vocab_file)
    vocab_size = len(vocab_count)
    for i, c in enumerate(vocab_count):
        if c < 5000:
            vocab_size = i
            break
    vocab_str = vocab_str[:vocab_size]

    corpus = data_utils.load_news_corpus(corpus_dir)
    corpus = process_corpus(corpus)

    polar_seed = get_polar_seed(corpus, vocab_size, vocab2idx)

    print('Saving polar seed in file %s' % polar_seed_file)
    with open(polar_seed_file, 'w') as fout:
        for i, polar in polar_seed:
            fout.write("%s\t%d\t%f\n" % (vocab_str[i], i, polar))

    return get_set(polar_seed, K)
Example #16
def generate():
    random.seed(SEED)
    np.random.seed(SEED)
    vocab_dict, vocab_res = data_utils.load_vocab('./vocab.txt')
    data = data_utils.load_data('data.pkl')

    vocab_size = len(vocab_dict)
    SEQ_LENGTH = data.shape[1]

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM,
                          SEQ_LENGTH, START_TOKEN)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    samples = generator.generate(sess)
    for i in range(int(1)):
        if i > len(samples):
            break
        arr = samples[i]
        poem = ''
        for index in arr:
            if index != data_utils.EOS_ID:
                poem += vocab_res[index]
        print(poem)
Example #17
    def train(self,
              epoch=25,
              batch_size=1,
              learning_rate=0.0002,
              momentum=0.9,
              decay=0.95,
              data_dir="data",
              dataset_name="cnn",
              vocab_size=1000000):
        if not self.vocab:
            self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                    vocab_size)

        self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                             decay=decay,
                                             momentum=momentum)

        for epoch_idx in xrange(epoch):
            data_loader = load_dataset(data_dir, dataset_name, vocab_size)

            contexts, questions, answers = [], [], []
            for batch_idx in xrange(batch_size):
                _, context, question, answer, _ = data_loader.next()
                contexts.append(context)
                questions.append(question)
                answers.append(answer)
Example #18
def build_joint_vocab(config):

    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005, processing_word,
                                             vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003, processing_word,
                                             vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor, processing_word,
                                          vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Example #19
def predit():
    vocab_path = "../vocab.pickle"

    input_sentence = "不是"
    index_to_char, char_to_index, vocab_size = data_utils.load_vocab(
        vocab_path)
    form_input = []
    for ch in input_sentence:
        try:
            ch = char_to_index[ch]
            form_input.append(ch)
        except KeyError:
            pass
    encoder_inputs, encoder_inputs_length = data_utils.prepare_predict_batch(
        [form_input])
    checkpoint = "../model/checkpoint/model.ckpt-1"

    with tf.Session() as sess:
        model = attention_seq2seq(vocab_size=vocab_size, mode='decode')
        model.build_model()
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=checkpoint)
        predicted_ids = model.predict(
            sess=sess,
            encoder_inputs=encoder_inputs,
            encoder_inputs_length=encoder_inputs_length)
        predicted_ids = predicted_ids[0].tolist()
        predicted_ids = predicted_ids[0]
        print(predicted_ids)
        temp = [
            index_to_char[i] for i in predicted_ids
            if i != data_utils.end_token
        ]
        print(temp)
        print("".join(temp))
Example #20
def save_polar_optimal(K, vocab_file, corpus_dir, polar_seed_file,
                       polar_optimal_file):
    optimal_p, optimal_n = get_optimal_set(K, vocab_file, corpus_dir,
                                           polar_seed_file)

    corpus = data_utils.load_news_corpus(corpus_dir)
    corpus = process_corpus(corpus)

    vocab2idx, vocab_str, vocab_count = data_utils.load_vocab(vocab_file)
    vocab_size = len(vocab_count)
    for i, c in enumerate(vocab_count):
        if c < 500:
            vocab_size = i
            break
    vocab_str = vocab_str[:vocab_size]

    p_w = get_Pw(corpus, vocab_size)
    print('Calculating polar_optimal...')
    polar_optimal = [0. for _ in xrange(vocab_size)]
    ppw, npw = get_set_Pw(corpus, optimal_p, optimal_n)

    for wi in tqdm(xrange(len(p_w))):
        polar_optimal[wi] = polar(corpus, optimal_p, optimal_n, ppw, npw, wi,
                                  p_w[wi])

    polar_optimal = sorted(enumerate(polar_optimal),
                           key=lambda x: x[1],
                           reverse=True)

    print('Saving polar optimal in file %s' % polar_optimal_file)
    with open(polar_optimal_file, 'w') as fout:
        for i, p in polar_optimal:
            fout.write("%s\t%d\t%f\n" % (vocab_str[i], i, p))
Example #21
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #22
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=3,
                     ntype=4)

    model.build()

    # train, evaluate and interact
    print(vocab_tags)
    model.train(train, dev, vocab_tags)

    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
Example #23
def chat():
    """ In test mode, we don't create the backward path. """
    _, enc_vocab = data_utils.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data_utils.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_restore_parameters(sess, saver)
        output_file = open(
            '/Users/EleanorLeung/Documents/CITS4404/chatbot/output_convo.txt',
            'a+')
        # Decode from standard input.
        max_length = config.BUCKETS[-1][0]
        print('Talk to me! Enter to exit. Max length is', max_length)
        while True:
            line = get_user_input()
            if len(line) > 0 and line[-1] == '\n':
                line = line[:-1]
            if line == '':
                break
            output_file.write('HUMAN: ' + line + '\n')
            token_ids = data_utils.sentence2id(enc_vocab, line)
            if len(token_ids) > max_length:
                print('Max length I can handle is:', max_length)
                line = get_user_input()
                continue
            bucket_id = find_right_bucket(len(token_ids))
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, decoder_masks = data_utils.get_batch(
                [(token_ids, [])], bucket_id, batch_size=1)
            # Get output logits for the sentence.
            _, _, output_logits = run_step(sess, model, encoder_inputs,
                                           decoder_inputs, decoder_masks,
                                           bucket_id, True)
            response = construct_response(output_logits, inv_dec_vocab)
            print(response)
            output_file.write('BOT: ' + response + '\n')
        output_file.write('=============================================\n')
        output_file.close()
Example #24
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
    vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    vocab_chunks = [tags for tags in vocab_chunks]
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks used as mpqa has something wrong!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    vocab_aspect_tags = [tags for tags in vocab_aspect_tags]
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = [tags for tags in vocab_polarity_tags]
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = [tags for tags in vocab_joint_tags]
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
def demo():
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        model, _ = create_model(sess, forward_only=True)

        nl_vocab, _, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)

        decode_tools.demo(sess, model, nl_vocab, rev_cm_vocab, FLAGS)
def manual_eval(num_eval):
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        _, model_sig = graph_utils.get_model_signature(FLAGS)
        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)
        _, dev_set, _ = load_data(use_buckets=False)

        eval_tools.manual_eval(model_sig, dev_set, rev_nl_vocab, FLAGS,
                               FLAGS.model_dir, num_eval)
def eval(data_set, model_sig=None, verbose=True):
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)) as sess:
        if model_sig is None:
            _, model_sig = graph_utils.get_model_signature(FLAGS)
        print("evaluate " + model_sig + "...")
        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)

        return eval_tools.eval_set(model_sig, data_set, rev_nl_vocab, FLAGS,
                                   verbose=verbose)
Example #28
    def load(self):
        """Loads vocabulary, processing functions and embeddings
        """
        # 1. vocabulary
        self.vocab_words = load_vocab(self.filename_words)
        self.vocab_chars = load_vocab(self.filename_chars)

        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)

        # 2. get processing functions that map str -> id
        self.processing_word = get_processing_word(self.vocab_words,
                                                   self.vocab_chars,
                                                   lowercase=True,
                                                   chars=self.use_chars)

        # 3. get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                           if self.use_pretrained else None)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O":0, "B":1, "I":2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)
    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)
    processing_iob = get_processing_word(vocab_iob, 
                    lowercase=False)
    processing_type = get_processing_word(vocab_type, 
                    lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars),
                                         niob=3,
                                         ntype=ntype)

    model.build()

    model.train(train, dev, vocab_type)

    model.evaluate(test, vocab_type)
Example #30
def check_npz():
    vocab = load_vocab("../data/words.txt")
    idx = vocab['硕士']

    with open('../data/polyglot-zh.pkl', 'rb') as f:
        words, embeddings = pickle.load(f, encoding="latin1")
        words = list(words)
        embeddings = list(embeddings)
    word_idx = words.index('硕士')

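    # Note: `data` is not defined in this snippet; it is presumably the trimmed
    # embedding matrix (e.g. loaded from the exported .npz file).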
    return (data[idx] == embeddings[word_idx])
Example #31
def dopredict():
    """
    Given a file or a single sentence, predict the result.
    :return:
    """
    parser = argparse.ArgumentParser(description='Text CNN classifier')
    # a trained model must be specified
    parser.add_argument('--path',
                        type=str,
                        default="data/predict/",
                        help='path to the text file (or directory) to predict')
    parser.add_argument('--model',
                        type=str,
                        default="model/textcnn.model",
                        help='model to load for prediction')
    conf = Config()
    args = parser.parse_args()
    # specify the Field formats
    text_field = data_utils.TextTEXT
    label_field = data_utils.TextLABEL
    text_field.vocab = data_utils.load_vocab("model/text.vocab")
    label_field.vocab = data_utils.load_vocab("model/label.vocab")
    # load and initialize the model
    if os.path.exists(args.model):
        print('Found model file, loading model: {}'.format(args.model))
        cnn = torch.load(args.model)
    else:
        print("Model file not found, exiting")
        sys.exit(-1)
    # if a directory was given, predict every file inside it; otherwise predict the single file
    if os.path.isdir(args.path):
        files = os.listdir(args.path)
        files_path = [args.path + f for f in files]
    else:
        files_path = [args.path]
    # run the predictions
    for file in files_path:
        text, label = model_utils.predict(file, cnn, text_field, label_field,
                                          conf.cuda)
        print('[path]  {}\n[Text]  {}\n[Label] {}\n'.format(file, text, label))
    print(f'Predicted {len(files_path)} file(s) in total')
def decode(data_set, construct_model_dir=True, verbose=True):
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        model, _ = create_model(sess, forward_only=True,
                                construct_model_dir=construct_model_dir)

        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)

        decode_tools.decode_set(sess, model, data_set,
                                rev_nl_vocab, rev_cm_vocab, FLAGS, verbose)
        return model.model_sig
Example #33
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #34
    def __init__(self, root_dir, config, debug=True):
        self.config = config
        self.test_set = tf.gfile.Glob(join(root_dir, "test", "*.question"))
        if debug:
            self.validation_set = tf.gfile.Glob(
                join(root_dir, "validation", "*.question"))
        else:
            self.training_set = tf.gfile.Glob(
                join(root_dir, "training", "*.question"))
        self.vocabulary, self.reverse_vocabulary = load_vocab(
            root_dir, str(Config.vocab_size - 2))
        self.reverse_vocabulary = ['BAR_', 'UNK_'] + self.reverse_vocabulary
        self.pool = Pool(4)
        self.debug = debug
    def prepare_model(self, data_dir, dataset_name, vocab_size):
        if not self.vocab:
            self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name, vocab_size)
            print(" [*] Loading vocab finished.")

        self.vocab_size = len(self.vocab)

        self.emb = tf.get_variable("emb", [self.vocab_size, self.size])

        # inputs
        self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.max_nsteps])
        embed_inputs = tf.nn.embedding_lookup(self.emb, tf.transpose(self.inputs))

        tf.histogram_summary("embed", self.emb)

        # output states
        _, states = rnn.rnn(
            self.stacked_cell, tf.unpack(embed_inputs), dtype=tf.float32, initial_state=self.initial_state
        )
        self.batch_states = tf.pack(states)

        self.nstarts = tf.placeholder(tf.int32, [self.batch_size, 3])
        outputs = tf.pack(
            [
                tf.slice(self.batch_states, nstarts, [1, 1, self.output_size])
                for idx, nstarts in enumerate(tf.unpack(self.nstarts))
            ]
        )

        self.outputs = tf.reshape(outputs, [self.batch_size, self.output_size])

        self.W = tf.get_variable("W", [self.vocab_size, self.output_size])
        tf.histogram_summary("weights", self.W)
        tf.histogram_summary("output", outputs)

        self.y = tf.placeholder(tf.float32, [self.batch_size, self.vocab_size])
        self.y_ = tf.matmul(self.outputs, self.W, transpose_b=True)

        self.loss = tf.nn.softmax_cross_entropy_with_logits(self.y_, self.y)
        tf.scalar_summary("loss", tf.reduce_mean(self.loss))

        correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        tf.scalar_summary("accuracy", self.accuracy)

        print(" [*] Preparing model finished.")
  def train(self, epoch=25, batch_size=1,
            learning_rate=0.0002, momentum=0.9, decay=0.95,
            data_dir="data", dataset_name="cnn", vocab_size=1000000):
    if not self.vocab:
      self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name, vocab_size)

    self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=decay,
                                         momentum=momentum)

    for epoch_idx in xrange(epoch):
      data_loader = load_dataset(data_dir, dataset_name, vocab_size)

      contexts, questions, answers = [], [], []
      for batch_idx in xrange(batch_size):
        _, context, question, answer, _ = data_loader.next()
        contexts.append(context)
        questions.append(question)
        answers.append(answer)
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #38
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from model import NERModel
from config import Config

# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags  = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                lowercase=True, chars=config.chars)
processing_tag  = get_processing_word(vocab_tags, 
                lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev   = CoNLLDataset(config.dev_filename, processing_word,
                    processing_tag, config.max_iter)
test  = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                    processing_tag, config.max_iter)

# build model
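The snippet above stops at the "# build model" comment. Judging from Example #1 at the top of this listing, which uses the same imports and variable names, the continuation would likely look roughly like this sketch (not the verbatim rest of the original):

# build model (hypothetical continuation, mirroring Example #1)
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# train and evaluate
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)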
Example #39
vocab_words = set()
vocab_tags = set()
vocab_chars = set()
file = open('data/all.txt')
for line in file:
	line = line.strip()
	if len(line) == 0:
		continue
	token, tag = line.split(' ')
	print(token, tag)
	for c in token:
		vocab_chars.add(c)
	vocab_words.add(token)
	vocab_tags.add(tag)

# Build Word and Tag vocab
vocab_glove = get_glove_vocab(config.glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocabs
write_vocab(vocab, config.words_filename)
write_vocab(vocab_tags, config.tags_filename)
write_vocab(vocab_chars, config.chars_filename)

# Trim GloVe Vectors
vocab = load_vocab(config.words_filename)
export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim)