def main():
    """Fine-tune the source-trained SAL-BLSTM-OAL-CRF model on ritter2011 target data."""
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/ritter2011/train"
    # BUG FIX: dev and test previously pointed at the train split, so
    # validation/early-stopping scores were computed on the training data.
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"
    # redirect every source-domain path at its target-domain counterpart
    config.filename_chars = config.filename_chars.replace("source", "target")
    config.filename_glove = config.filename_glove.replace("source", "target")
    config.filename_tags = config.filename_tags.replace("source", "target")
    config.filename_words = config.filename_words.replace("source", "target")
    config.dir_model = config.dir_model.replace("source", "target")
    config.dir_output = config.dir_output.replace("source", "target")
    config.path_log = config.path_log.replace("source", "target")

    if config.gpu_ids:  # guard: avoid IndexError on machines with no configured GPUs
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])

    # build model, warm-start from the source-domain weights, and
    # re-initialize the projection layer for the target tag set
    model = SAL_BLSTM_OAL_CRF_Model(config)
    model.build()
    model.restore_session("results/source/model.weights/")
    model.reinitialize_weights("proj")

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)
    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev)
def main(warmup=False):
    """Transfer-train SAL-BLSTM-OAL-CRF from a source dataset onto a target dataset.

    Args:
        warmup: training-schedule switch.  NOTE(review): the default is the
            boolean False but the check below compares against the string
            "none", so that branch only runs when callers explicitly pass
            warmup="none" — confirm this is intended.
    """
    # create instance of config; defer load() until after the paths are rewritten
    config = Config(load=False)
    source_dataset = "o"
    target_dataset = "r"
    config.batch_size = 10
    config.filename_train = "../datasets/%s/train_bioes" % datasets[target_dataset]
    config.filename_dev = "../datasets/%s/dev_bioes" % datasets[target_dataset]
    config.filename_test = "../datasets/%s/test_bioes" % datasets[target_dataset]
    # Enable the below line only when you are using different embs.
    # Make sure you have run the "python prep_data.py source_dataset target_dataset"
    # Make sure you have run the "python prep_data.py target_dataset target_dataset"
    config.filename_words = "../datasets/%s/words.txt" % datasets[source_dataset]
    config.filename_chars = "../datasets/%s/chars.txt" % datasets[source_dataset]
    config.filename_tags = "../datasets/%s/tags.txt" % datasets[target_dataset]
    config.filename_trimmed = config.filename_trimmed.replace(
        "dataset_name", datasets[source_dataset])
    config.dir_model = config.dir_model.replace("/source", "/target")
    config.dir_output = config.dir_output.replace("/source", "/target")
    config.path_log = config.path_log.replace("/source", "/target")
    config.oal_hidden_size_lstm = 100
    config.psi = 1
    config.load()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)
    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)

    if config.gpu_ids:
        # FIX: guard the hard-coded index so a machine with fewer than
        # three configured GPUs no longer raises IndexError.
        gpu_index = 2 if len(config.gpu_ids) > 2 else 0
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[gpu_index])

    if warmup == "none":
        config.lr_method = "adam"
        config.lr = 0.001
        config.lr_decay = 1
        config.nepochs = 50
        # (redundant re-assignments of batch_size=10 and psi=1 removed —
        # they were already set to those same values above)

    model = SAL_BLSTM_OAL_CRF_Model(config)
    model.build()
    model.restore_session("results/source/model.weights/", transfer_mode=True)
    model.train(train, dev)
def main():
    """Build target-domain (ritter2011) vocabularies and trimmed GloVe vectors."""
    # vocab files do not exist yet, so skip loading them
    config = Config(load=False)
    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ritter2011/train"
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"

    # point every source-domain path at its target-domain counterpart
    for attr in ("filename_chars", "filename_glove", "filename_tags",
                 "filename_words", "dir_model", "dir_output", "path_log"):
        setattr(config, attr, getattr(config, attr).replace("source", "target"))

    preprocess = get_processing_word(lowercase=True)

    # one lazy dataset generator per split
    dev = NERDataset(config.filename_dev, preprocess)
    test = NERDataset(config.filename_test, preprocess)
    train = NERDataset(config.filename_train, preprocess)

    # word/tag vocab, restricted to words that actually have GloVe vectors
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab = vocab_words & get_glove_vocab(config.filename_glove)
    vocab.add(UNK)
    vocab.add(NUM)
    vocab_tags.add(UNK)

    # persist vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # trim the full GloVe matrix down to the saved vocab
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # character vocab comes from the raw (unprocessed) training file
    train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(train), config.filename_chars)
def get_vocabs_from_dataset(dataset):
    """Return (word vocab, tag vocab) built over a dataset's train/dev/test splits."""
    base = "../datasets/%s" % datasets[dataset]
    lowercased = get_processing_word(lowercase=True)

    # one generator per split, in the order get_vocabs expects
    splits = [NERDataset("%s/%s_bioes" % (base, name), lowercased)
              for name in ("train", "dev", "test")]

    vocab_words, vocab_tags = get_vocabs(splits)
    return vocab_words, vocab_tags
def main():
    """Evaluate the trained SAL-BLSTM-OAL-CRF model on the ritter2011 test set."""
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/ritter2011/train"
    # BUG FIX: dev/test previously pointed at the train split, so the model
    # was "evaluated" on its own training data.
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"
    # redirect every source-domain path at its target-domain counterpart
    config.filename_chars = config.filename_chars.replace("source", "target")
    config.filename_glove = config.filename_glove.replace("source", "target")
    config.filename_tags = config.filename_tags.replace("source", "target")
    config.filename_words = config.filename_words.replace("source", "target")
    config.dir_model = config.dir_model.replace("source", "target")
    config.dir_output = config.dir_output.replace("source", "target")
    config.path_log = config.path_log.replace("source", "target")

    if config.gpu_ids:  # guard: avoid IndexError on machines with no configured GPUs
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])

    # build model and restore the trained weights
    model = SAL_BLSTM_OAL_CRF_Model(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    test = NERDataset(config.filename_test, config.processing_word,
                      config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)
def main():
    """Train a plain BLSTM-CRF model on the conll2003 dataset."""
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/conll2003/train"
    # NOTE(review): dev and test both point at the train split, so validation
    # scores are computed on training data — confirm whether separate
    # dev/test files exist for conll2003 and should be used here.
    config.filename_dev = "../datasets/conll2003/train"
    config.filename_test = "../datasets/conll2003/train"

    if config.gpu_ids:  # FIX: guard empty gpu list (consistent with the other entry points)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])

    # build model
    model = BLSTM_CRF_Model(config)
    model.build()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)
    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev)
def main():
    """Build source-domain (ontonotes-nw) vocabularies and trimmed GloVe vectors."""
    config = Config(load=False)  # vocab files do not exist yet, so skip load
    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ontonotes-nw/train"
    config.filename_dev = "../datasets/ontonotes-nw/dev"
    config.filename_test = "../datasets/ontonotes-nw/test"

    preprocess = get_processing_word(lowercase=True)

    # one lazy dataset per split
    dev_set = NERDataset(config.filename_dev, preprocess)
    test_set = NERDataset(config.filename_test, preprocess)
    train_set = NERDataset(config.filename_train, preprocess)

    # word/tag vocab, restricted to words that actually have GloVe vectors
    words, tags = get_vocabs([train_set, dev_set, test_set])
    covered = words & get_glove_vocab(config.filename_glove)
    covered.add(UNK)
    covered.add(NUM)
    tags.add(UNK)

    # persist vocabularies
    write_vocab(covered, config.filename_words)
    write_vocab(tags, config.filename_tags)

    # trim the full GloVe matrix down to the saved vocab
    saved = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # character vocab comes from the raw (unprocessed) training file
    raw_train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Train the source-domain BLSTM-CRF model on the configured source dataset."""
    # defer config.load() until all paths have been rewritten
    config = Config(load=False)
    source_dataset = "o"
    config.batch_size = 50
    config.nepochs = 10

    # every file for this run lives under the source dataset's directory
    base = "../datasets/%s" % datasets[source_dataset]
    config.filename_train = "%s/train_bioes" % base
    config.filename_dev = "%s/dev_bioes" % base
    config.filename_test = "%s/test_bioes" % base
    config.filename_words = "%s/words.txt" % base
    config.filename_tags = "%s/tags.txt" % base
    config.filename_chars = "%s/chars.txt" % base
    config.filename_trimmed = config.filename_trimmed.replace(
        "dataset_name", datasets[source_dataset])
    config.load()

    if config.gpu_ids:
        print("using gpu ids:", config.gpu_ids)
        # NOTE(review): hard-coded index — raises IndexError on machines
        # with fewer than five configured GPUs; confirm this is intended.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[4])

    # build model
    model = BLSTM_CRF_Model(config)
    model.build()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)
    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev)