Example #1
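All eight examples below share the same preamble, which the snippets omit. The exact module paths are not shown anywhere in the source, so the layout sketched here (mirroring a typical sequence_tagging-style project structure) is an assumption, not the confirmed one:

import os

# assumed project-local imports; the real module paths may differ
from model.config import Config
from model.data_utils import (NERDataset, get_processing_word, get_vocabs,
                              get_glove_vocab, get_char_vocab, load_vocab,
                              write_vocab, export_trimmed_glove_vectors,
                              UNK, NUM)
from model.ner_model import BLSTM_CRF_Model, SAL_BLSTM_OAL_CRF_Model
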
def main():
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/ritter2011/train"
    # note: dev and test both point at the train file in this snippet
    config.filename_dev = "../datasets/ritter2011/train"
    config.filename_test = "../datasets/ritter2011/train"

    config.filename_chars = config.filename_chars.replace("source", "target")
    config.filename_glove = config.filename_glove.replace("source", "target")
    config.filename_tags = config.filename_tags.replace("source", "target")
    config.filename_words = config.filename_words.replace("source", "target")

    config.dir_model = config.dir_model.replace("source", "target")
    config.dir_output = config.dir_output.replace("source", "target")
    config.path_log = config.path_log.replace("source", "target")

    os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])
    # build model
    model = SAL_BLSTM_OAL_CRF_Model(config)
    model.build()
    model.restore_session("results/source/model.weights/")
    # re-initialize the projection ("proj") layer before fine-tuning on the target domain
    model.reinitialize_weights("proj")

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)

    dev   = NERDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    # train model
    model.train(train, dev)
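
Each of these scripts would typically be run through a standard entry point, which the snippets leave out:

if __name__ == "__main__":
    main()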
Example #2
def main(warmup="none"):
    # create instance of config
    config = Config(load=False)

    source_dataset = "o"
    target_dataset = "r"
    config.batch_size = 10
    config.filename_train = "../datasets/%s/train_bioes" % datasets[
        target_dataset]
    config.filename_dev = "../datasets/%s/dev_bioes" % datasets[target_dataset]
    config.filename_test = "../datasets/%s/test_bioes" % datasets[
        target_dataset]

    # Enable the lines below only when you are using different embeddings.
    # Make sure you have run "python prep_data.py source_dataset target_dataset"
    # and "python prep_data.py target_dataset target_dataset" first.
    config.filename_words = "../datasets/%s/words.txt" % datasets[
        source_dataset]
    config.filename_chars = "../datasets/%s/chars.txt" % datasets[
        source_dataset]
    config.filename_tags = "../datasets/%s/tags.txt" % datasets[target_dataset]
    config.filename_trimmed = config.filename_trimmed.replace(
        "dataset_name", datasets[source_dataset])

    config.dir_model = config.dir_model.replace("/source", "/target")
    config.dir_output = config.dir_output.replace("/source", "/target")
    config.path_log = config.path_log.replace("/source", "/target")
    config.oal_hidden_size_lstm = 100
    config.psi = 1
    config.load()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)

    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)
    if config.gpu_ids:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[2])

    if warmup == "none":
        config.lr_method = "adam"
        config.lr = 0.001
        config.lr_decay = 1
        config.batch_size = 10
        config.psi = 1
        config.nepochs = 50
        model = SAL_BLSTM_OAL_CRF_Model(config)
        model.build()
        model.restore_session("results/source/model.weights/",
                              transfer_mode=True)
        model.train(train, dev)
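
This example indexes a module-level datasets mapping that the snippet does not define. Judging by the shorthand keys used here and the literal paths in the other examples, it plausibly looks like the following; this is an assumption, not confirmed by the source:

# hypothetical mapping from shorthand keys to dataset directory names
datasets = {
    "o": "ontonotes-nw",
    "r": "ritter2011",
    "c": "conll2003",
}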
Example #3
def main():
    # get config and processing of words
    config = Config(load=False)

    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ritter2011/train"
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"

    config.filename_chars = config.filename_chars.replace("source", "target")
    config.filename_glove = config.filename_glove.replace("source", "target")
    config.filename_tags = config.filename_tags.replace("source", "target")
    config.filename_words = config.filename_words.replace("source", "target")

    config.dir_model = config.dir_model.replace("source", "target")
    config.dir_output = config.dir_output.replace("source", "target")
    config.path_log = config.path_log.replace("source", "target")

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = NERDataset(config.filename_dev, processing_word)
    test = NERDataset(config.filename_test, processing_word)
    train = NERDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab_tags.add(UNK)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim Word Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = NERDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
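
get_processing_word(lowercase=True) is used above purely as a token normalizer during vocab building. A minimal sketch of what it presumably does, assuming the common lowercase-and-mask-digits behaviour (not confirmed by the source):

def get_processing_word(lowercase=False):
    # returns a callable applied to every token before vocab building
    def f(word):
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM  # collapse all numbers into a single NUM token
        return word
    return f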
Example #4
def get_vocabs_from_dataset(dataset):
    filename_train = "../datasets/%s/train_bioes" % datasets[dataset]
    filename_dev = "../datasets/%s/dev_bioes" % datasets[dataset]
    filename_test = "../datasets/%s/test_bioes" % datasets[dataset]

    processing_word = get_processing_word(lowercase=True)
    # Generators
    dev   = NERDataset(filename_dev, processing_word)
    test  = NERDataset(filename_test, processing_word)
    train = NERDataset(filename_train, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    return vocab_words, vocab_tags
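
A quick usage sketch: comparing the tag sets of a source and a target dataset to see how much label overlap a transfer run could exploit. The "o" and "r" keys are the same assumed shorthands as in Example #2:

source_words, source_tags = get_vocabs_from_dataset("o")
target_words, target_tags = get_vocabs_from_dataset("r")
print("shared tags:", sorted(source_tags & target_tags))
print("target-only tags:", sorted(target_tags - source_tags))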
Example #5
def main():
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/ritter2011/train"
    # note: dev and test both point at the train file in this snippet
    config.filename_dev = "../datasets/ritter2011/train"
    config.filename_test = "../datasets/ritter2011/train"

    config.filename_chars = config.filename_chars.replace("source", "target")
    config.filename_glove = config.filename_glove.replace("source", "target")
    config.filename_tags = config.filename_tags.replace("source", "target")
    config.filename_words = config.filename_words.replace("source", "target")

    config.dir_model = config.dir_model.replace("source", "target")
    config.dir_output = config.dir_output.replace("source", "target")
    config.path_log = config.path_log.replace("source", "target")

    os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])
    # build model
    model = SAL_BLSTM_OAL_CRF_Model(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    test = NERDataset(config.filename_test, config.processing_word,
                      config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)
Example #6
def main():
    # create instance of config
    config = Config()
    config.filename_train = "../datasets/conll2003/train"
    # note: dev and test both point at the train file in this snippet
    config.filename_dev = "../datasets/conll2003/train"
    config.filename_test = "../datasets/conll2003/train"

    os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[0])
    # build model
    model = BLSTM_CRF_Model(config)
    model.build()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)

    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)
    # train model
    model.train(train, dev)
Example #7
def main():
    # get config and processing of words
    config = Config(load=False)

    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ontonotes-nw/train"
    config.filename_dev = "../datasets/ontonotes-nw/dev"
    config.filename_test = "../datasets/ontonotes-nw/test"

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = NERDataset(config.filename_dev, processing_word)
    test = NERDataset(config.filename_test, processing_word)
    train = NERDataset(config.filename_train, processing_word)
    # for word, tag in train:
    #     print("word: {}".format(word))
    #     print("tag: {}".format(tag))

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab_tags.add(UNK)
    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim Word Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = NERDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
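
The vocab files written above are read back later via load_vocab. A minimal sketch of the presumed round-trip, assuming the usual one-token-per-line format where the line number becomes the token's index:

def write_vocab(vocab, filename):
    # one token per line; line number becomes the token's index
    with open(filename, "w") as f:
        f.write("\n".join(vocab))

def load_vocab(filename):
    # inverse of write_vocab: token -> index
    with open(filename) as f:
        return {word.strip(): idx for idx, word in enumerate(f)}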
Example #8
def main():
    # create instance of config
    config = Config(load=False)

    source_dataset = "o"
    config.batch_size = 50
    config.nepochs = 10
    config.filename_train = "../datasets/%s/train_bioes" % datasets[
        source_dataset]
    config.filename_dev = "../datasets/%s/dev_bioes" % datasets[source_dataset]
    config.filename_test = "../datasets/%s/test_bioes" % datasets[
        source_dataset]

    config.filename_words = "../datasets/%s/words.txt" % datasets[
        source_dataset]
    config.filename_tags = "../datasets/%s/tags.txt" % datasets[source_dataset]
    config.filename_chars = "../datasets/%s/chars.txt" % datasets[
        source_dataset]

    config.filename_trimmed = config.filename_trimmed.replace(
        "dataset_name", datasets[source_dataset])
    config.load()

    if config.gpu_ids:
        print("using gpu ids:", config.gpu_ids)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_ids[4])
    # build model
    model = BLSTM_CRF_Model(config)
    model.build()

    # create datasets
    train = NERDataset(config.filename_train, config.processing_word,
                       config.processing_tag, config.max_iter)

    dev = NERDataset(config.filename_dev, config.processing_word,
                     config.processing_tag, config.max_iter)
    # train model
    model.train(train, dev)