Example #1
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _  = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)

    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)

    processing_pos = get_processing_word(vocab_pos,
                                         lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)


    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)

    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)
    
    # build model
    model = NERModel(config, embeddings, embeddings_uni,
                     pos_embeddings, ntags=len(vocab_tags), nchars=len(vocab_chars), vocab_words=idx2words,
                    NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    state = config.state  # 'state' is undefined in the original snippet;
                          # assumed here to be a config option
    if state == "train":
        model.train(train, dev, vocab_tags)

    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)

    else:  # state == "predict"
        # 'file' is likewise undefined in the original snippet; presumably
        # the raw input file to convert
        convert(file)
        t2o("data_format/test_convert.txt","data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)

        model.evaluate(test, vocab_tags)

        tagging("data_format/test_convert.txt")
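
All of these examples lean on the same handful of helpers. As a reference, here is a minimal sketch of what load_vocab plausibly does, assuming one token per line in the vocabulary file (the project in Example #1 uses a variant that also returns the inverse index-to-token map):

def load_vocab(filename):
    """Map each token to its line number, assuming one token per line."""
    vocab = {}
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx
    return vocab
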
Example #2
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
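
Every example builds its lookup callbacks with get_processing_word. A condensed sketch of the closure it returns, assuming $UNK$ and $NUM$ sentinel tokens; the keyword arguments mirror the calls above, while project-specific extras (Pref_Suff, pos_lm, dic_flag, ...) are omitted:

UNK, NUM = "$UNK$", "$NUM$"   # assumed sentinel tokens

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    def f(word):
        # optional character-level ids, computed before normalization
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        # normalize the word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        # map the word to its id, falling back to UNK when allowed
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise KeyError("unknown word with allow_unk=False: %s" % word)
        # with chars enabled, the callback returns a (char_ids, word_id) pair
        if vocab_chars is not None and chars:
            return char_ids, word
        return word
    return f
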
Example #3
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
Example #4
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    model = WImpModel(config,
                      embeddings,
                      ntags=config.nclass,
                      nchars=len(vocab_chars))

    # build WImpModel
    model.build_graph()

    # train, evaluate and interact
    model.train(train, dev)
    model.evaluate(test)
Example #5
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False, allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, config.max_iter)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
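
CoNLLDataset shows up with anywhere from one to four processing callbacks depending on the project. A sketch of the basic two-column (word, tag) variant used in Example #5; the projects with POS, IOB or entity-type columns extend the same pattern with one extra callback per column:

class CoNLLDataset(object):
    """Yield one (words, tags) pair per sentence from a CoNLL-style file.

    Blank lines and -DOCSTART- markers separate sentences; each remaining
    line carries the token in its first column and the tag in its last.
    """
    def __init__(self, filename, processing_word=None,
                 processing_tag=None, max_iter=None):
        self.filename = filename
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.max_iter = max_iter   # optional cap on yielded sentences

    def __iter__(self):
        niter = 0
        with open(self.filename, encoding="utf-8") as f:
            words, tags = [], []
            for line in f:
                line = line.strip()
                if not line or line.startswith("-DOCSTART-"):
                    if words:
                        niter += 1
                        if self.max_iter is not None and niter > self.max_iter:
                            break
                        yield words, tags
                        words, tags = [], []
                else:
                    cols = line.split()
                    word, tag = cols[0], cols[-1]
                    if self.processing_word is not None:
                        word = self.processing_word(word)
                    if self.processing_tag is not None:
                        tag = self.processing_tag(tag)
                    words.append(word)
                    tags.append(tag)
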
Example #6
    def __init__(self, config):
        self.config = config
        self.vocab_words = load_vocab(self.config.filename_words)
        self.vocab_tags = load_vocab(self.config.filename_tags)
        self.vocab_chars = load_vocab(self.config.filename_chars)

        # Get pre-trained embeddings
        self.w_embeddings = (get_trimmed_glove_vectors(config.filename_trimmed)
                             if self.config.use_pretrained else None)
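
The "trimmed" embeddings loaded throughout are pre-trained GloVe vectors restricted to the task vocabulary and cached as a compressed NumPy archive (Example #15 points at glove.840B.300d.npz). A minimal sketch, assuming the trimming step stored the matrix under the key "embeddings":

import numpy as np

def get_trimmed_glove_vectors(filename):
    """Load the vocabulary-aligned embedding matrix from a .npz archive."""
    with np.load(filename) as data:
        return data["embeddings"]
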
Example #7
class nlu():

    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # get logger
    # logger = get_logger(config.log_path)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()

    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}
    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)
    # model.logger.info("This is an interactive mode, enter a sentence:")

    @staticmethod
    def rec(sentence):
        try:

            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            words_raw = character_separation(sentence)[0].split(' ')
            # Python 2: decode the byte strings to unicode before lookup
            words_raw = [unicode(word, 'utf-8') for word in words_raw]

            words = list(map(processing_word, words_raw))
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = [nlu.idx_to_tag[idx] for idx in pred_ids[0]]
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return preds
        except EOFError:
            print("Closing session.")


# nlu.rec('请播放电视剧三生三世十里桃花')  # "Play the drama Three Lives, Three Worlds, Ten Miles of Peach Blossoms"
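
Example #7 is otherwise Python 2 code: unicode() no longer exists under Python 3, and tf.Session/tf.train.Saver date it to TensorFlow 1.x. The same word preprocessing in Python 3 collapses to:

# Python 3: str is already unicode, so no explicit decode is needed
words_raw = character_separation(sentence)[0].split(' ')
words = [processing_word(w) for w in words_raw]
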
Example #8
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=3,
                     ntype=4)

    model.build()

    # train, evaluate and interact
    print(vocab_tags)
    model.train(train, dev, vocab_tags)

    stime = time.time()  # assumes 'import time' at module level
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
Example #9
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_pref_suff = load_vocab(config.PS_filename)  # prefix/suffix vocab
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)
    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          vocab_pref_suff,
                                          vocab_pref_suff_2,
                                          vocab_pref_suff_4,
                                          lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags,
                                         lowercase=False,
                                         Geoparser=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    # dev, test and train hold raw words and tags here; the processing
    # functions map them to word and tag indices, so model.evaluate below
    # can run run_evaluate (inside run_epoch) directly on these datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
Example #10
    def load(self):
        """Loads vocabulary, processing functions and embeddings
        """
        # 1. vocabulary
        self.vocab_words = load_vocab(self.filename_words)
        self.vocab_chars = load_vocab(self.filename_chars)

        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)

        # 2. get processing functions that map str -> id
        self.processing_word = get_processing_word(self.vocab_words,
                                                   self.vocab_chars,
                                                   lowercase=True,
                                                   chars=self.use_chars)

        # 3. get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                           if self.use_pretrained else None)
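
A plausible call site for this load method; the Config constructor and its load flag are assumptions, not shown in the source:

config = Config(load=False)   # hypothetical flag that defers loading
# ... generate filename_words, filename_chars and the trimmed .npz first ...
config.load()
print("loaded %d words, %d chars" % (config.nwords, config.nchars))
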
Example #11
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O":0, "B":1, "I":2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)
    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)
    processing_iob = get_processing_word(vocab_iob, 
                    lowercase=False)
    processing_type = get_processing_word(vocab_type, 
                    lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars),
                                         niob=3,
                                         ntype=ntype)

    model.build()

    model.train(train, dev, vocab_type)

    model.evaluate(test, vocab_type)
Example #12
def train():
    preprocess()
    vocab, rev_vocab = data_utils.initialize_vocabulary(FLAGS.vocabulary_file)
    embeddings = data_utils.get_trimmed_glove_vectors(FLAGS.save_embedding_file)
    model = cnn_model.CNN(
        batch_size=FLAGS.batch_size,
        word_embedding=embeddings,
        sent_len=FLAGS.max_sentence_len,
        input_type=FLAGS.input_layer_type,
        word_num=len(rev_vocab),
        word_dim=FLAGS.embedding_dim,
        vocab=vocab,
        l2_alpha=FLAGS.l2_reg_lambda,
        dropout_prob=FLAGS.dropout_keep_prob,
        kernel_num=FLAGS.num_filters,
        learning_rate_base=FLAGS.learning_rate,
        epoch=FLAGS.num_epochs,
        model_path=FLAGS.model_path
    )
    train_data = data_utils.text_dataset('./input/data/train_data.ids', FLAGS.max_sentence_len)
    valid_data = data_utils.text_dataset('./input/data/valid_data.ids', FLAGS.max_sentence_len)
    print('train data size={a}, valid data size={b}'.format(a=len(train_data), b=len(valid_data)))
    model.train(train_data, valid_data)
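
data_utils.initialize_vocabulary returns both directions of the mapping, which is why the call above unpacks (vocab, rev_vocab) and the model sizes its embedding table with len(rev_vocab). A sketch, assuming the same one-token-per-line file format as load_vocab:

def initialize_vocabulary(vocabulary_path):
    """Return (token -> id dict, id -> token list) for a vocabulary file."""
    with open(vocabulary_path, encoding="utf-8") as f:
        rev_vocab = [line.strip() for line in f]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab
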
Example #13
# the first line of this call is cut off in the source; it presumably
# opened with something like:
processing_word = get_processing_word(vocab_words,
                                      vocab_syls,
                                      pos_tags,
                                      lowercase=True,
                                      chars=config.chars,
                                      morphs=config.morphs,
                                      posflag=config.posTag,
                                      pos_lm=config.posLM,
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags,
                                     posflag=True,
                                     lowercase=True,
                                     pos_lm=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
syl_embeddings = get_exported_dic_vectors(config.exported_sfilename)
pos_embeddings = get_exported_pos_vectors(config.exported_pfilename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
Example #14
# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags  = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                lowercase=True, chars=config.chars)
processing_tag  = get_processing_word(vocab_tags, 
                lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev   = CoNLLDataset(config.dev_filename, processing_word,
                    processing_tag, config.max_iter)
test  = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                    processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                     nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
Example #15
    # the opening of this method is cut off in the source; the lines below
    # are a plausible reconstruction (the minibatches helper is assumed;
    # see the sketch after this example)
    def valid_accuracy(self, valid):  # hypothetical method name
        cnt, acc_total = 0, 0.0
        for _, (data, y) in enumerate(minibatches(valid, self.batch_size)):
            cnt += 1
            acc = self.sess.run(self.accuracy,
                                feed_dict={
                                    self.input_x: data,
                                    self.input_y: y,
                                    self.dropout: 1.0
                                })
            acc_total += self.batch_size * acc
        acc_valid = round(acc_total * 1.0 / len(valid), 3)
        return acc_valid


if __name__ == '__main__':
    vocabulary_path = './input/data/vocabulary.txt'
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocabulary_path)
    embed_path = './input/data/embed/glove.840B.300d.npz'
    embeddings = data_utils.get_trimmed_glove_vectors(embed_path)
    model = CNN(batch_size=10,
                word_embedding=embeddings,
                sent_len=100,
                input_type='CNN-static',
                word_num=len(rev_vocab),
                word_dim=300,
                vocab=vocab)
    train_data = data_utils.text_dataset('./input/data/train_data.ids', 100)
    valid_data = data_utils.text_dataset('./input/data/valid_data.ids', 100)
    print('train set={a}, valid set={b}'.format(a=len(train_data),
                                                b=len(valid_data)))
    model.train(train_data, valid_data)
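
The reconstructed loop at the top of Example #15 assumes a minibatches helper. A minimal sketch of one (the name and its placement are assumptions, not taken from the source):

def minibatches(data, batch_size):
    """Yield (inputs, labels) batches of at most batch_size examples."""
    x_batch, y_batch = [], []
    for x, y in data:
        if len(x_batch) == batch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []
        x_batch.append(x)
        y_batch.append(y)
    if x_batch:
        yield x_batch, y_batch
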
Example #16
    # (the opening of this snippet is cut off in the source; it presumably
    #  began with "def main(config):" and load_vocab calls for vocab_words,
    #  vocab_poss, vocab_chunks, vocab_aspect_tags and vocab_polarity_tags)
    vocab_joint_tags = load_vocab(config.joint_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          lowercase=config.lowercase)
    processing_pos = get_processing_word(vocab_poss, lowercase=False)
    processing_chunk = get_processing_word(vocab_chunks, lowercase=False)
    processing_aspect_tag = get_processing_word(vocab_aspect_tags,
                                                lowercase=False)
    processing_polarity_tag = get_processing_word(vocab_polarity_tags,
                                                  lowercase=False)
    processing_joint_tag = get_processing_word(vocab_joint_tags,
                                               lowercase=False)

    # get pre-trained embeddings
    domain_embeddings = get_trimmed_glove_vectors(
        config.domain_trimmed_filename)
    general_embeddings = get_trimmed_glove_vectors(
        config.general_trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_pos,
                       processing_chunk, processing_aspect_tag,
                       processing_polarity_tag, processing_joint_tag,
                       config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_pos,
                        processing_chunk, processing_aspect_tag,
                        processing_polarity_tag, processing_joint_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_pos, processing_chunk,
                         processing_aspect_tag, processing_polarity_tag,