def main(config):
    """Build the NER model from ``config`` and run one of train / evaluate /
    predict depending on the ambient ``state``.

    NOTE(review): ``state`` and ``file`` are read from enclosing/module
    scope, not parameters — confirm they are defined before this runs.
    """
    # Vocabularies produced by the build step; words also yield a reverse index.
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # str -> id mapping functions (words lowercased; tags/POS left as-is).
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # Pre-trained embedding matrices (word, uni-gram, POS, NE dictionary).
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # Datasets for validation and training.
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_pos, config.max_iter)

    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == "predict"
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
def main(config):
    """Load vocabularies and embeddings, build the CNN-LSTM-CRF model and
    write tag predictions for the test set.

    Args:
        config: object exposing the *_filename paths, ``chars`` flag and
            model hyper-parameters consumed by ``CnnLstmCrfModel``.
    """
    # Vocabularies produced by the preprocessing step.
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # str -> id mapping functions (words lowercased; tag vocabs left as-is).
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)
    # Fix: the original also built a `processing_tag` mapper for vocab_tags
    # that was never used — removed. vocab_tags itself is still needed below.

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
def main(config):
    """Train the CNN-LSTM-CRF model on train/dev and report test-set scores."""
    # Vocabularies produced by the preprocessing step.
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # str -> id mapping functions (words lowercased; tag vocabs left as-is).
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Datasets: each Data wraps a file plus the mappers above.
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
def main(config):
    """Train the word-importance (WImp) model and evaluate it on the test set."""
    # Vocabularies produced by the build step.
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # str -> id mapper; character features are always enabled here (chars=True).
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=True)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Annotated datasets.
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    model = WImpModel(config, embeddings, ntags=config.nclass,
                      nchars=len(vocab_chars))
    model.build_graph()

    model.train(train, dev)
    model.evaluate(test)
def main(config):
    """End-to-end NER run: train on train/dev, score on test, then open an
    interactive tagging shell."""
    # Vocabularies produced by the build step.
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # str -> id mappers; unknown tags are an error (allow_unk=False).
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Datasets.
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
def __init__(self, config):
    """Store the config and load vocabularies and optional embeddings.

    Args:
        config: object exposing filename_words, filename_tags,
            filename_chars, filename_trimmed and the use_pretrained flag.
    """
    self.config = config
    self.vocab_words = load_vocab(self.config.filename_words)
    self.vocab_tags = load_vocab(self.config.filename_tags)
    self.vocab_chars = load_vocab(self.config.filename_chars)

    # Pre-trained embeddings; None when use_pretrained is False.
    # Consistency fix: access through self.config throughout instead of
    # mixing bare `config` and `self.config` (same object, clearer intent).
    self.w_embeddings = (
        get_trimmed_glove_vectors(self.config.filename_trimmed)
        if self.config.use_pretrained else None)
class nlu():
    """Interactive NER tagging service.

    All model/session setup runs once at class-definition time (class-level
    statements), so importing this module loads the TF graph and restores
    the checkpoint. NOTE(review): this is Python 2 code (`unicode`, py2
    `map` usage) and reads `config` from module scope — confirm both.
    """
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    # pre-trained word embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    # get logger
    # logger = get_logger(config.log_path)
    # build model (no logger attached)
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()
    # reverse mapping id -> tag for decoding predictions
    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}
    # restore trained weights into a session kept open for all rec() calls
    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)
    # model.logger.info("This is an interactive mode, enter a sentence:")

    @staticmethod
    def rec(sentence):
        """Tag one sentence and return the predicted tag list.

        The sentence is segmented by character_separation, mapped to word
        ids, run through the restored model, and decoded via idx_to_tag.
        """
        try:
            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            # print character_separation(sentence)[0]
            words_raw = character_separation(sentence)[0].split(' ')
            # for word in words_raw:
            #     if type(word)==str:
            # Python 2: decode each token from UTF-8 bytes to unicode.
            words_raw = [unicode(word, 'utf-8') for word in words_raw]
            # words_raw = [word.decode('utf-8') for word in words_raw]
            #     else:
            #         words_raw = [unicode(word, 'utf-8') for word in words_raw]
            words = map(processing_word, words_raw)
            words = list(words)
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = map(lambda idx: nlu.idx_to_tag[idx], list(pred_ids[0]))
            # print(list(preds))
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return list(preds)
        except EOFError:
            # Returns None implicitly when input is exhausted.
            print("Closing session.")

# Example (Chinese: "please play the TV series Eternal Love"):
# nlu.rec('请播放电视剧三生三世十里桃花')
def main(config):
    """Train the joint NER model (tags + IOB + entity type) and time the
    final evaluation on the test set.

    Args:
        config: object exposing the *_filename paths, chars flag and
            max_iter consumed by CoNLLDataset/NERModel.
    """
    # Vocabularies produced by the build step.
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # Fixed label inventories for the auxiliary IOB and type objectives.
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # str -> id mapping functions.
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Datasets carrying word, tag, IOB and type mappers.
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # Fix: the original used the Python 2 print statement, which is a
    # SyntaxError on Python 3; print(x) behaves identically on py2 for a
    # single argument.
    print(vocab_tags)
    model.train(train, dev, vocab_tags)

    # Time the test-set evaluation.
    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
def main(config):
    """Train and evaluate the geoparsing NER model with prefix/suffix
    features."""
    # Vocabularies, including prefix/suffix vocabularies of several lengths.
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)

    # str -> id mapping functions.
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          vocab_pref_suff, vocab_pref_suff_2,
                                          vocab_pref_suff_4, lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         Geoparser=True)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # The datasets hold raw words/tags; the processing functions map them to
    # indices, which model.evaluate / run_epoch consume downstream.
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def load(self):
    """Loads vocabulary, processing functions and embeddings onto self."""
    # 1. vocabularies and their sizes
    self.vocab_words = load_vocab(self.filename_words)
    self.vocab_chars = load_vocab(self.filename_chars)
    self.nwords = len(self.vocab_words)
    self.nchars = len(self.vocab_chars)

    # 2. processing function mapping str -> id
    self.processing_word = get_processing_word(
        self.vocab_words, self.vocab_chars,
        lowercase=True, chars=self.use_chars)

    # 3. pre-trained embeddings (None when use_pretrained is False)
    self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                       if self.use_pretrained else None)
def main(config):
    """Train and evaluate the POS model with tag, IOB and entity-type labels.

    Args:
        config: object exposing the *_filename paths, chars flag and
            max_iter consumed by CoNLLDataset/POSmodel.
    """
    # Vocabularies produced by the build step.
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # NOTE(review): `dictionary`/`types_dic` are never used below; kept only
    # because load_vocab reads data/types.txt — confirm and remove.
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    # Fixed IOB label inventory; entity types come from a vocab file.
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    # Fix: the original used the Python 2 print statement (SyntaxError on
    # py3); print(x) behaves identically on py2 for a single argument.
    print(vocab_type)

    # str -> id mapping functions.
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # Pre-trained word embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Datasets carrying word, tag, IOB and type mappers.
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=ntype)
    model.build()

    model.train(train, dev, vocab_type)
    model.evaluate(test, vocab_type)
def train():
    """Preprocess the corpus, build the CNN text classifier and train it.

    All hyper-parameters come from the module-level FLAGS object.
    """
    preprocess()

    # Vocabulary (token -> id and the reverse list) plus trimmed embeddings.
    vocab, rev_vocab = data_utils.initialize_vocabulary(FLAGS.vocabulary_file)
    embeddings = data_utils.get_trimmed_glove_vectors(FLAGS.save_embedding_file)

    model = cnn_model.CNN(
        batch_size=FLAGS.batch_size,
        word_embedding=embeddings,
        sent_len=FLAGS.max_sentence_len,
        input_type=FLAGS.input_layer_type,
        word_num=len(rev_vocab),
        word_dim=FLAGS.embedding_dim,
        vocab=vocab,
        l2_alpha=FLAGS.l2_reg_lambda,
        dropout_prob=FLAGS.dropout_keep_prob,
        kernel_num=FLAGS.num_filters,
        learning_rate_base=FLAGS.learning_rate,
        epoch=FLAGS.num_epochs,
        model_path=FLAGS.model_path,
    )

    train_data = data_utils.text_dataset('./input/data/train_data.ids',
                                         FLAGS.max_sentence_len)
    valid_data = data_utils.text_dataset('./input/data/valid_data.ids',
                                         FLAGS.max_sentence_len)
    # Fixes: typo "zize" -> "size" in the log message; len(x) instead of
    # calling the __len__ dunder directly.
    print('train data size ={a}, valid data size ={b}'.format(
        a=len(train_data), b=len(valid_data)))
    model.train(train_data, valid_data)
vocab_syls, pos_tags, lowercase=True, chars=config.chars, morphs=config.morphs, posflag=config.posTag, pos_lm=config.posLM, dic_flag=config.dic_flag) processing_tag = get_processing_word(vocab_tags, lowercase=False) processing_pos = get_processing_word(pos_tags=pos_tags, posflag=True, lowercase=True, pos_lm=True) # get pre trained embeddings embeddings = get_trimmed_glove_vectors(config.trimmed_filename) dic_embeddings = get_exported_dic_vectors(config.exported_filename) morph_embeddings = get_exported_morph_vectors(config.exported_mfilename) syl_embeddings = get_exported_dic_vectors(config.exported_sfilename) pos_embeddings = get_exported_pos_vectors(config.exported_pfilename) # create dataset dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag, processing_pos, config.max_iter) test = CoNLLDataset(config.test_filename, processing_word, processing_tag, processing_pos, config.max_iter) train = CoNLLDataset(config.train_filename, processing_word, processing_tag, processing_pos, config.max_iter) # build model lmwords = len(vocab_words)
# create instance of config
config = Config()

# vocabularies produced by the build step
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# str -> id mapping functions (words lowercased; tags left as-is)
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# pre-trained word embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
self.batch_size)): cnt += 1 acc = self.sess.run(self.accuracy, feed_dict={ self.input_x: data, self.input_y: y, self.dropout: 1.0 }) acc_total += self.batch_size * acc acc_valid = round(acc_total * 1.0 / len(valid), 3) return acc_valid if __name__ == '__main__': vocabulary_path = './input/data/vocabulary.txt' vocab, rev_vocab = data_utils.initialize_vocabulary(vocabulary_path) embed_path = './input/data/embed/glove.840B.300d.npz' embeddings = data_utils.get_trimmed_glove_vectors(embed_path) model = CNN(batch_size=10, word_embedding=embeddings, sent_len=100, input_type='CNN-static', word_num=len(rev_vocab), word_dim=300, vocab=vocab) train_data = data_utils.text_dataset('./input/data/train_data.ids', 100) valid_data = data_utils.text_dataset('./input/data/valid_data.ids', 100) print('train set={a},valid set={b}'.format(a=train_data.__len__(), b=valid_data.__len__())) model.train(train_data, valid_data)
vocab_joint_tags = load_vocab(config.joint_tags_filename) # get processing functions processing_word = get_processing_word(vocab_words, lowercase=config.lowercase) processing_pos = get_processing_word(vocab_poss, lowercase=False) processing_chunk = get_processing_word(vocab_chunks, lowercase=False) processing_aspect_tag = get_processing_word(vocab_aspect_tags, lowercase=False) processing_polarity_tag = get_processing_word(vocab_polarity_tags, lowercase=False) processing_joint_tag = get_processing_word(vocab_joint_tags, lowercase=False) # get pre trained embeddings domain_embeddings = get_trimmed_glove_vectors( config.domain_trimmed_filename) general_embeddings = get_trimmed_glove_vectors( config.general_trimmed_filename) # create dataset dev = CoNLLDataset(config.dev_filename, processing_word, processing_pos, processing_chunk, processing_aspect_tag, processing_polarity_tag, processing_joint_tag, config.max_iter) test = CoNLLDataset(config.test_filename, processing_word, processing_pos, processing_chunk, processing_aspect_tag, processing_polarity_tag, processing_joint_tag, config.max_iter) train = CoNLLDataset(config.train_filename, processing_word, processing_pos, processing_chunk, processing_aspect_tag, processing_polarity_tag,