def load(self):
        """Populate vocabularies, str -> id functions and embeddings.

        Expects the vocabulary files (and, when ``use_pretrained`` is set,
        the trimmed GloVe vectors) to exist already, i.e. build_data.py
        must have been run beforehand.
        """
        # Vocabularies: one per file (words, tags, characters).
        self.vocab_words = load_vocab(self.filename_words)
        self.vocab_tags = load_vocab(self.filename_tags)
        self.vocab_chars = load_vocab(self.filename_chars)

        # Vocabulary sizes.
        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)
        self.ntags = len(self.vocab_tags)

        # Callables mapping str -> id. Words are lower-cased and, depending
        # on ``use_chars``, processed at character level as well.
        word_fn = get_processing_word(self.vocab_words, self.vocab_chars,
                                      lowercase=True, chars=self.use_chars)
        self.processing_word = word_fn

        # Tags keep their case and unknown tags are not allowed.
        tag_fn = get_processing_word(self.vocab_tags, lowercase=False,
                                     allow_unk=False)
        self.processing_tag = tag_fn

        # Pre-trained embeddings are optional.
        if self.use_pretrained:
            self.embeddings = get_trimmed_glove_vectors(self.filename_trimmed)
        else:
            self.embeddings = None
Ejemplo n.º 2
0
def train():
    """Train the tagger configured by the module-level ``args``.

    Loads the word and tag vocabularies, wraps the train/dev/test files in
    ``LoadDataset``, obtains the model configuration (loaded from
    ``args.config_file`` when that file exists, otherwise created and saved
    there) and runs ``model.train`` inside a TensorFlow session.
    """
    word_to_id, id_to_word = load_vocab(args.vocab_file)
    tag_to_id, id_to_tag = load_vocab(args.tag_file)
    word_fn = get_processing_word(word_to_id)
    tag_fn = get_processing_word(tag_to_id, allow_unk=False)

    def as_dataset(path):
        # Every split shares the same word/tag processing functions.
        return LoadDataset(path, word_fn, tag_fn)

    train_sentences = as_dataset(args.train_file)
    dev_sentences = as_dataset(args.dev_file)
    test_sentences = as_dataset(args.test_file)  # built, not used below

    # Switching the tagging scheme (IOB / IOBES) is currently disabled:
    # update_tag_scheme(train_sentences, args.tag_schema)
    # update_tag_scheme(test_sentences, args.tag_schema)

    if not os.path.isfile(args.config_file):
        # First run: derive the configuration and persist it.
        config = config_model(word_to_id, tag_to_id, id_to_tag)
        save_config(config, args.config_file)
    else:
        config = load_config(args.config_file)

    make_path(args)
    logger = get_logger(os.path.join("log", args.log_file))

    with tf.Session() as sess:
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, logger)
        model.train(train_sentences, dev_sentences)
Ejemplo n.º 3
0
def main():
    """Build and save word, tag, POS, chunk and char vocabularies.

    Reads the train/dev/test CoNLL files, keeps the dataset words that also
    appear in the GloVe vocabulary, appends the special tokens, writes each
    vocabulary to the file named in the config, then exports the trimmed
    GloVe vectors and the POS/chunk vectors. Finally writes the character
    vocabulary of the training file.
    """
    config = Config(load=False)
    word_fn = get_processing_word(lowercase=True)
    pos_fn = get_processing_word()
    chunk_fn = get_processing_word()

    def open_split(path):
        # All splits share the same word/POS/chunk processing functions.
        return CoNLLDataset(path, word_fn, pos_fn, chunk_fn)

    dev = open_split(config.filename_dev)
    test = open_split(config.filename_test)
    train = open_split(config.filename_train)

    # Vocabularies over all three splits, plus the GloVe vocabulary.
    vocab_words, vocab_tags, vocab_poses, vocab_chunks = get_vocabs(
        [train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Keep only words that GloVe knows, then append the special tokens.
    pad = "$pad$"
    vocab = [word for word in vocab_words if word in vocab_glove]
    vocab.extend([UNK, NUM, pad])
    vocab_poses.append(pad)
    vocab_chunks.append(pad)
    vocab_tags.append(pad)

    # Persist the vocabularies.
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    write_vocab(vocab_poses, config.filename_poses)
    write_vocab(vocab_chunks, config.filename_chunks)

    # Trimmed GloVe vectors for the saved word vocabulary.
    vocab = load_vocab(config.filename_words)
    print(len(vocab))
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Vectors for the POS and chunk vocabularies (presumably one-hot,
    # judging by the helper's name -- confirm against its definition).
    vocab = load_vocab(config.filename_poses)
    export_trimed_ont_hot_vectors(vocab, config.filename_pos_trimmed)
    vocab = load_vocab(config.filename_chunks)
    export_trimed_ont_hot_vectors(vocab, config.filename_chunk_trimmed)

    # Character vocabulary: taken from the training file read without any
    # processing functions.
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    vocab_chars.append(pad)
    write_vocab(vocab_chars, config.filename_chars)
Ejemplo n.º 4
0
def main():
    """Build an NER model and train it on classifier-style CoNLL data."""
    # Per the original author's note, Config also does the data loading
    # here: it carries the vocabularies, the pre-trained GloVe embedding
    # matrix and the str -> id processing functions.
    config = Config()

    model = NERModel(config)
    model.build("train")

    # Optional: restore weights / reinitialize a layer before training.
    # model.restore_session("results/crf/model.weights/")
    # model.reinitialize_weights("proj")

    # Raw datasets; words are only lower-cased at this stage.
    lowercase_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev, lowercase_word)
    train = CoNLLDataset(config.filename_train, lowercase_word)
    test = CoNLLDataset(config.filename_test, lowercase_word)

    def for_classifier(dataset):
        # Wrap a dataset with the config's word/tag -> id functions.
        return CoNLLdata4classifier(dataset,
                                    processing_word=config.processing_word,
                                    processing_tag=config.processing_tag)

    train4cl = for_classifier(train)
    dev4cl = for_classifier(dev)
    test4cl = for_classifier(test)

    model.train(train4cl, dev4cl, test4cl)
Ejemplo n.º 5
0
def main():
    """Evaluate a restored NER model on the test or dev split.

    Command line: ``<script> [test|dev]``. Without an argument the test
    split is evaluated.

    Raises:
        ValueError: if more than one argument is given, or the argument is
            neither ``test`` nor ``dev``. (Previously an unknown split name
            left ``test`` unbound and crashed later with a NameError, and
            the argument count was only checked by an ``assert``, which is
            skipped under ``python -O``.)
    """
    # Validate the command line first, so a bad argument fails before the
    # model is built and restored.
    cli_args = sys.argv[1:]
    if len(cli_args) > 1:
        raise ValueError(
            "expected at most one argument ('test' or 'dev'), got %d"
            % len(cli_args))
    split = cli_args[0] if cli_args else 'test'
    if split not in ('test', 'dev'):
        raise ValueError(
            "unknown split %r: expected 'test' or 'dev'" % split)

    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)

    # create dataset; only the selected split's filename is read
    processing_word = get_processing_word(lowercase=True)
    if split == 'dev':
        filename = config.filename_dev
    else:
        filename = config.filename_test
    test = CoNLLDataset(filename, processing_word)

    test4cl = CoNLLdata4classifier(test,
                                   processing_word=config.processing_word,
                                   processing_tag=config.processing_tag)

    # evaluate
    model.evaluate(test4cl)
Ejemplo n.º 6
0
def main():
    """Write the word, tag and char vocabularies and trimmed GloVe vectors.

    The word vocabulary is the intersection of the dataset words (train,
    dev and test) with the GloVe vocabulary, plus the UNK and NUM tokens.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    # One dataset per split, all sharing the same word processing.
    dev, test, train = (
        CoNLLDataset(path, lowercase_word)
        for path in (config.filename_dev, config.filename_test,
                     config.filename_train)
    )

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Only words with a GloVe vector are kept; special tokens are added.
    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Re-read the saved vocabulary and export the matching GloVe vectors.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 7
0
def main():
    """Count test-set entities and how many of them are in the GloVe vocab.

    Every chunk returned by ``get_chunks_from_tags`` is turned into a key
    of the form ``ENTITY/word1_word2_...``. Prints the number of distinct
    keys, then the number of those keys present in the GloVe vocabulary.
    """
    config = Config()

    # Words keep their original case.
    keep_case = get_processing_word(lowercase=False)
    dev = CoNLLDataset(config.filename_dev, keep_case)      # not used below
    train = CoNLLDataset(config.filename_train, keep_case)  # not used below
    test = CoNLLDataset(config.filename_test, keep_case)

    entities = set()
    for raw_words, raw_tags in test:
        for _, chunk_start, chunk_end in get_chunks_from_tags(raw_tags):
            # Join the chunk's words with underscores after the prefix.
            joined = '_'.join(raw_words[i]
                              for i in range(chunk_start, chunk_end))
            entities.add('ENTITY/' + joined)

    print(len(entities))
    vocab_glove = get_glove_vocab(config.filename_glove)
    print(len(entities & vocab_glove))
Ejemplo n.º 8
0
def main():
    """Build vocabularies and trimmed GloVe / word2vec vectors.

    Iterates over train, dev and test to collect the word and tag
    vocabularies, keeps the words also found in the GloVe vocabulary (plus
    UNK and NUM), and writes the vocabularies to the files named in the
    config. It then converts the word2vec binary file to text, exports
    trimmed word2vec and GloVe vectors for the saved vocabulary, and
    finally writes the character vocabulary of the training file.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    def open_split(path):
        # Each split is read with the same word processing and task.
        return CoNLLDataset(path, lowercase_word, task=config.task)

    dev = open_split(config.filename_dev)
    test = open_split(config.filename_test)
    train = open_split(config.filename_train)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    # TODO get word2vec vocab too

    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])

    # Persist word and tag vocabularies.
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    saved_vocab = load_vocab(config.filename_words)

    # word2vec: binary -> text file, then trim to the saved vocabulary.
    write_word2vec_to_txtfile(config.path_to_word2vec_bin_file,
                              config.filename_word2vec)
    export_trimmed_word2vec_vectors(saved_vocab, config.filename_word2vec,
                                    config.trimmed_word2vec_filename,
                                    config.dim_word)

    # GloVe: trim to the same vocabulary.
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.trimmed_glove_filename,
                                 config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 9
0
def main():
    """Build the vocabulary files and the trimmed GloVe embedding file.

    Collects words and tags from train, dev and test, restricts the words
    to those present in the GloVe vocabulary (plus UNK and NUM), writes the
    vocabularies to disk, exports the GloVe vectors for the saved word
    vocabulary and writes the character vocabulary of the training file.
    """
    config = Config(load=False)
    # Per the original note: lower-cases words and replaces digits by NUM.
    lowercase_word = get_processing_word(lowercase=True)

    # Each dataset is iterable and yields (words, tags) per sentence.
    dev, test, train = [
        CoNLLDataset(path, lowercase_word)
        for path in (config.filename_dev, config.filename_test,
                     config.filename_train)
    ]

    # Word and tag vocabularies of the data.
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    print(len(vocab_words))

    # GloVe vocabulary.
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Set intersection, then the special tokens are added by hand.
    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])
    print("len of vocab without entity: ", len(vocab))

    # Entity vocabulary support is disabled:
    # vocab_entity = entity2vocab(datasets=[train, dev, test])
    # vocab.update(vocab_entity)
    # vocab = entity2vocab(datasets=[train, dev], vocab=vocab)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Reload as {word: index} (per the original note) and write the
    # embedding matrix for exactly those words.
    word_to_index = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(word_to_index, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary: built from the raw training file, without the
    # lower-casing processing function.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 10
0
def main():
    """Build word, tag and char vocabularies from train and dev only.

    Words keep their case and are not filtered against GloVe; UNK, PAD and
    NUM are added to the word vocabulary. The character vocabulary starts
    with UNK, PAD and NUM, followed by the characters found in both the
    training and the dev file.
    """
    config = Config(load=False)
    keep_case = get_processing_word(lowercase=False)

    dev = CoNLLDataset(config.filename_dev, keep_case)
    # The test set is not included yet (original note: add it later).
    train = CoNLLDataset(config.filename_train, keep_case)

    vocab_words, vocab_tags = get_vocabs([train, dev])

    # GloVe filtering is intentionally skipped for now:
    # vocab_glove = get_glove_vocab(config.filename_glove)
    # vocab = vocab_words & vocab_glove
    vocab = vocab_words
    vocab.update([UNK, PAD, NUM])

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trimmed GloVe export is disabled as well:
    # vocab = load_vocab(config.filename_words)
    # export_trimmed_glove_vectors(vocab, config.filename_glove,
    #                              config.filename_trimmed, config.dim_word)

    # Character vocabularies from the unprocessed files.
    chars_train = get_char_vocab(CoNLLDataset(config.filename_train))
    chars_dev = get_char_vocab(CoNLLDataset(config.filename_dev))

    # NOTE(review): this is the intersection of dev and train characters,
    # so a character seen in only one split is dropped -- confirm that a
    # union was not intended.
    shared_chars = list(chars_dev & chars_train)
    vocab_chars = [UNK, PAD, NUM] + shared_chars

    write_vocab(vocab_chars, config.filename_chars)
Ejemplo n.º 11
0
def main():
    """Build the word, pronoun and char vocabularies and trimmed GloVe file.

    Collects the words of train, dev and test, keeps those found in the
    GloVe vocabulary, writes the result of ``move_pronouns`` to the
    pronoun file, adds the special tokens, saves the word vocabulary and
    exports the matching GloVe vectors. The character vocabulary of the
    training file is written with PAD_TOKEN as its first entry.
    """
    config = Config(load=False)
    pw_function = get_processing_word(lowercase=True)

    dev, test, train = [
        Dataset(path, processing_word=pw_function)
        for path in (config.filename_dev, config.filename_test,
                     config.filename_train)
    ]

    vocab_words = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Words with a GloVe vector, as a list so that order can be controlled.
    vocab = list(vocab_words & vocab_glove)
    pronouns_in_vocab = move_pronouns(vocab)
    write_vocab(pronouns_in_vocab, config.filename_pronouns)

    # add_special_tokens puts START, STOP, PAD, UNK and NUM into the list
    # (per the original comment); the asserts check its result.
    add_special_tokens(vocab)
    assert PAD_TOKEN == vocab[0]
    assert UNKNOWN_TOKEN in vocab

    write_vocab(vocab, config.filename_words)

    # load_vocab returns a pair here; only its first element is needed.
    saved_vocab, _ = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file, PAD first.
    raw_train = Dataset(config.filename_train)
    vocab_chars = [PAD_TOKEN] + list(get_char_vocab(raw_train))
    write_vocab(vocab_chars, config.filename_chars)
Ejemplo n.º 12
0
def main():
    """Build vocabularies and trimmed GloVe vectors for NER or POS data.

    When ``config.task`` is ``'pos'`` the train/dev/test filenames are
    redirected to the ``data/*.pos`` files; otherwise the config's own
    filenames are used. The rest follows the usual pipeline: collect word
    and tag vocabularies over all splits, keep the words known to GloVe
    (plus UNK and NUM), save them, export the trimmed GloVe vectors and
    write the character vocabulary of the training file.
    """
    config = Config(load=False)
    if config.task != 'pos':
        print("USING NER")
    else:
        print("USING POS")
        config.filename_train = "data/train.pos"
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"

    lowercase_word = get_processing_word(lowercase=True)

    def open_split(path):
        return CoNLLDataset(path, lowercase_word)

    dev = open_split(config.filename_dev)
    test = open_split(config.filename_test)
    train = open_split(config.filename_train)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Export the GloVe vectors for the vocabulary as saved on disk.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 13
0
def main():
    """Build vocabularies and trimmed GloVe vectors, logging each step.

    Collects word and tag vocabularies over train, dev and test, keeps the
    words that have a GloVe vector (plus UNK and NUM), writes the
    vocabularies, exports the trimmed embeddings and writes the character
    vocabulary of the training file. A line is printed before each file is
    written.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    dev, test, train = [
        CoNLLDataset(path, lowercase_word)
        for path in (config.filename_dev, config.filename_test,
                     config.filename_train)
    ]

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Intersect with the GloVe word set: only words with a vector remain.
    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])

    # Save the vocabularies (vocab is a set at this point).
    print("write vocab set to file: " + config.filename_words)
    write_vocab(vocab, config.filename_words)
    print("write vocab tags set to file: " + config.filename_tags)
    write_vocab(vocab_tags, config.filename_tags)

    # Reload as a word -> index mapping (per the original comment) and
    # export only the vectors of words in that vocabulary.
    word_to_index = load_vocab(config.filename_words)
    print("export trimmed vocab embedding to file: " + config.filename_trimmed)
    export_trimmed_glove_vectors(word_to_index, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    char_vocab = get_char_vocab(raw_train)
    print("save char set to file:" + config.filename_chars)
    write_vocab(char_vocab, config.filename_chars)
Ejemplo n.º 14
0
def main():
    """Build vocabularies from the training split only, plus singletons.

    Unlike the dev/test-inclusive variants, the word, frequency and tag
    vocabularies come from the training data alone. The word vocabulary is
    restricted to GloVe words, NUM is added and UNK is placed first. Words
    whose frequency is exactly 1 are written to the singletons file. The
    trimmed GloVe vectors and the character vocabulary are then exported.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    # dev and test are constructed but do not feed the vocabularies.
    dev = CoNLLDataset(config.filename_dev, lowercase_word)
    test = CoNLLDataset(config.filename_test, lowercase_word)
    train = CoNLLDataset(config.filename_train, lowercase_word)

    # Training data only (dev and test deliberately left out).
    vocab_words, vocab_freqs, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.filename_glove)

    glove_words = vocab_words & vocab_glove
    # Frequency-based UNK replacement is disabled:
    # glove_words = make_unks(glove_words, vocab_freqs, config.p_unk)
    glove_words.add(NUM)
    # UNK goes first, so it is the first entry handed to write_vocab.
    vocab = [UNK] + list(glove_words)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Words seen exactly once.
    singletons = [word for word, count in vocab_freqs.items() if count == 1]
    write_vocab(singletons, config.filename_singletons)

    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 15
0
def main():
    """Build vocabularies and trimmed vectors for the target dataset.

    Points the config at the ritter2011 train/dev/test files, rewrites
    "source" to "target" in the vocabulary, GloVe, model, output and log
    paths, and then runs the usual pipeline: vocabularies over all splits,
    restriction to GloVe words, UNK/NUM added to the words and UNK to the
    tags, trimmed vector export, character vocabulary.
    """
    config = Config(load=False)

    # Target-domain data (ontonotes-nw would work as well, per the
    # original comment).
    config.filename_train = "../datasets/ritter2011/train"
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"

    # Redirect every source_* path to its target_* counterpart.
    # NOTE(review): filename_trimmed is not rewritten here although it is
    # written below -- confirm it does not overwrite the source file.
    for attr in ("filename_chars", "filename_glove", "filename_tags",
                 "filename_words", "dir_model", "dir_output", "path_log"):
        setattr(config, attr, getattr(config, attr).replace("source", "target"))

    lowercase_word = get_processing_word(lowercase=True)

    dev = NERDataset(config.filename_dev, lowercase_word)
    test = NERDataset(config.filename_test, lowercase_word)
    train = NERDataset(config.filename_train, lowercase_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])
    vocab_tags.add(UNK)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the word vectors to the saved vocabulary.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 16
0
def build(config):
    """Write the word, tag and character vocabulary files.

    The word vocabulary comes from the training file only (filtered by
    ``config.min_count``); UNK is placed at index 0 and the special flags
    NUM, NUU, FLT and FLU at indices 1-4. Tags are collected over train and
    dev. The character vocabulary comes from the unprocessed training file
    with UNK at index 0.

    Args:
        config: (instance of Config) provides the filenames and min_count.
    """
    lowercase_word = get_processing_word(lowercase=True)

    # Words: training data only.
    train = CoNLLDataset(config.filename_train, lowercase_word)
    vocab, _ = get_vocabs([train], config.min_count)
    vocab.insert(0, UNK)

    # Pin each special flag to a fixed slot right after UNK, dropping any
    # earlier occurrence first so that it is not duplicated.
    slot = 1
    for flag in (NUM, NUU, FLT, FLU):
        if flag in vocab:
            vocab.remove(flag)
        vocab.insert(slot, flag)
        slot += 1

    # Tags: train and dev (the test set is not used).
    dev = CoNLLDataset(config.filename_dev, lowercase_word)
    train = CoNLLDataset(config.filename_train, lowercase_word)
    _, vocab_tags = get_vocabs([train, dev])

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Characters: unprocessed training file, UNK first.
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    vocab_chars.insert(0, UNK)

    write_vocab(vocab_chars, config.filename_chars)
Ejemplo n.º 17
0
def get_vocabs_from_dataset(dataset):
    """Return the word and tag vocabularies of one BIOES-tagged dataset.

    Args:
        dataset: key into the module-level ``datasets`` mapping; the mapped
            value is used as the directory name under ``../datasets``.

    Returns:
        The ``(vocab_words, vocab_tags)`` pair produced by ``get_vocabs``
        over the train, dev and test files of that dataset.
    """
    folder = datasets[dataset]
    lowercase_word = get_processing_word(lowercase=True)

    # Same construction order as before: dev, test, train.
    dev = NERDataset("../datasets/%s/dev_bioes" % folder, lowercase_word)
    test = NERDataset("../datasets/%s/test_bioes" % folder, lowercase_word)
    train = NERDataset("../datasets/%s/train_bioes" % folder, lowercase_word)

    return get_vocabs([train, dev, test])
def main():
    """Append new test-set words to the saved vocabulary and re-trim GloVe.

    Collects the words of the test file, keeps those found in the GloVe
    vocabulary, passes them through ``entity2vocab`` together with the
    dataset, and appends every resulting entry that is not yet in the
    word vocabulary file to that file. The trimmed GloVe vectors are then
    re-exported for the updated vocabulary.
    """
    config = Config(load=False)
    # Per the original note: lower-cases words and replaces digits by NUM.
    lowercase_word = get_processing_word(lowercase=True)

    # Dataset whose words are to be added; yields (words, tags) sentences.
    to_be_add = CoNLLDataset1(config.filename_test, lowercase_word)

    vocab_words, _ = get_vocabs([to_be_add])
    vocab_glove = get_glove_vocab(config.filename_glove)
    words_have_vec = vocab_words & vocab_glove

    vocab_words_and_entity = entity2vocab(datasets=[to_be_add],
                                          vocab=words_have_vec)

    # Only entries missing from the vocabulary file need to be appended.
    already_saved = set(load_vocab(config.filename_words))
    new_words = vocab_words_and_entity - already_saved

    if new_words:
        with open(config.filename_words, 'a') as f:
            # Each entry is preceded by a newline, so the file is assumed
            # not to end with one -- TODO confirm against write_vocab.
            f.writelines('\n{}'.format(word) for word in new_words)

    # Reload as {word: index} (per the original note) and regenerate the
    # embedding matrix for the updated vocabulary.
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Ejemplo n.º 19
0
def main():
    """Build the vocabulary files and the trimmed GloVe vectors.

    Walks train, dev and test to collect the word and tag vocabularies,
    keeps the words known to GloVe (plus UNK and NUM), writes both
    vocabularies, exports the GloVe vectors of the saved word vocabulary
    and writes the character vocabulary of the training file.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    def open_split(path):
        return CoNLLDataset(path, lowercase_word)

    dev = open_split(config.filename_dev)
    test = open_split(config.filename_test)
    train = open_split(config.filename_train)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.update([UNK, NUM])

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # The vectors are exported for the vocabulary as saved on disk.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary from the unprocessed training file.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 20
0
def generate_model_data(data_prefix=None):
    """Resolve the dataset paths, preprocess if needed, and open the splits.

    Args:
        data_prefix: optional prefix. When truthy, the dev/test/train
            filenames are redirected to ``<cwd>/data/<prefix>_<basename>``.

    If the (possibly redirected) dev file does not exist, ``preprocess_data``
    is called to generate the input files before the three datasets are
    opened.
    """
    # Original note: Config() loads the PubMed articles.
    config = Config()
    print('Config')
    processing_word = get_processing_word(lowercase=True)  # not used below
    print('Processing_word')

    if data_prefix:
        cwd = os.getcwd()
        for attr in ('filename_dev', 'filename_test', 'filename_train'):
            prefixed = data_prefix + '_' + os.path.basename(
                getattr(config, attr))
            setattr(config, attr, os.path.join(cwd, 'data', prefixed))

    # A missing dev file is taken as the sign that nothing has been
    # preprocessed yet.
    if not os.path.isfile(config.filename_dev):
        print('Preprocessing tokens and labels to generate input data files')
        preprocess_data()

    dev = CoNLLDataset(config.filename_dev)
    test = CoNLLDataset(config.filename_test)
    train = CoNLLDataset(config.filename_train)
    print('Loaded dev, test, train')
Ejemplo n.º 21
0
def main():
    """Build the word/tag vocab files and the trimmed word-vector file.

    Reads the dev, test and train datasets named in the config, collects
    their word and tag vocabularies, writes the word vocab (with ``UNK``
    first and ``NUM`` last) and the tag vocab to disk, then reloads the word
    vocab and exports the trimmed word vectors for it.

    Takes no arguments; the config is built from the module-level
    ``parser`` with ``load=False``.

    """
    config = Config(parser, load=False)
    to_word = get_processing_word(lowercase=True)

    # Dataset readers, created in dev / test / train order.
    dev = Dataset(config.filename_dev, to_word)
    test = Dataset(config.filename_test, to_word)
    train = Dataset(config.filename_train, to_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # Every dataset word is kept (no intersection with the word-vector
    # vocabulary); UNK goes first and NUM goes last.
    word_list = [UNK] + list(vocab_words) + [NUM]

    write_vocab(word_list, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    print('Wrote vocab')

    # Reload the vocab from disk and export the vectors for it.
    loaded_vocab = load_vocab(config.filename_words)
    export_trimmed_wordvec_vectors(loaded_vocab, config.filename_wordvec,
                                   config.filename_wordvec_trimmed)

    print('trimmed vocab')
def main():
    """Build the word/tag vocab files and save the matching word vectors.

    Collects the word and tag vocabularies of the dev, test and train
    datasets (case is preserved), takes the union of the dataset words and
    the embedding-file words, puts ``PAD`` and ``UNK`` in front, writes both
    vocabularies to disk and finally calls ``save_word2vec`` with the
    reloaded word-to-id mapping.

    """
    config = Config()
    keep_case = get_processing_word(lowercase=False)

    # Dataset readers, created in dev / test / train order.
    dev = LoadDataset(config.filename_dev, keep_case)
    test = LoadDataset(config.filename_test, keep_case)
    train = LoadDataset(config.filename_train, keep_case)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_embeddings_vocab(config.filename_glove, dim=300)

    # Union (not intersection): every embedding word is kept as well.
    # PAD ends up at index 0 and UNK at index 1 of the written list.
    all_words = [PAD, UNK] + list(vocab_words | vocab_glove)

    write_vocab(all_words, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # load_vocab returns a pair here; only the word -> id part is needed.
    word_to_id, _ = load_vocab(config.filename_words)
    save_word2vec(word_to_id, config.filename_glove, config.filename_trimmed,
                  config.dim_word)
Ejemplo n.º 23
0
def main():
    """Restore a trained NER model and continue training it in fine-tuning mode.

    Overrides a handful of hyper-parameters on a fresh ``Config``, builds
    the model with the ``"fine_tuning"`` option, restores weights from
    ``results/test/model.weights/``, wraps the three CoNLL datasets in
    ``CoNLLdata4classifier`` and calls ``model.train``.

    """
    # Per the original author's note: creating the config also loads the
    # data, i.e. it holds the vocabularies, the pre-trained GloVe embedding
    # matrix and the str -> id processing functions.
    config = Config()

    hyper_params = (
        ("nepochs", 200),
        ("dropout", 0.5),
        ("batch_size", 20),
        ("lr_method", "adam"),
        ("lr", 0.0001),
        ("lr_decay", 1.0),
        ("clip", -2.0),  # if negative, no clipping
        ("nepoch_no_imprv", 5),
    )
    for attr_name, attr_value in hyper_params:
        setattr(config, attr_name, attr_value)

    config.dir_model = config.dir_output + "model.finetuning.weights/"

    # Build the model and restore the previously trained weights.
    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session("results/test/model.weights/",
                          indicate="fine_tuning")

    # Raw datasets, created in dev / train / test order.
    to_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev, to_word)
    train = CoNLLDataset(config.filename_train, to_word)
    test = CoNLLDataset(config.filename_test, to_word)

    def _for_classifier(dataset):
        # Wrap a raw dataset with the config's word and tag processing.
        return CoNLLdata4classifier(dataset,
                                    processing_word=config.processing_word,
                                    processing_tag=config.processing_tag)

    train4cl = _for_classifier(train)
    dev4cl = _for_classifier(dev)
    test4cl = _for_classifier(test)

    model.train(train4cl, dev4cl, test4cl)
Ejemplo n.º 24
0
def main():
    """Build the word/tag/char vocab files and the trimmed fastText vectors.

    Collects the dataset vocabularies, selects the word vocab (all dataset
    words when ``config.use_fasttext_oov_vector_gen`` is set, otherwise only
    those also present in the fastText vocabulary), adds ``UNK`` and
    ``NUM``, generates vectors for the out-of-vocabulary words, writes the
    vocab files, exports the trimmed fastText vectors and finally writes
    the character vocab of the training set.

    """
    config = Config(load=False)
    to_word = data_utils.get_processing_word(lowercase=True)

    # Dataset readers, created in test / dev / train order.
    test = Dataset(config.filename_test, processing_word=to_word)
    dev = Dataset(config.filename_dev, processing_word=to_word)
    train = Dataset(config.filename_train, processing_word=to_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_fasttext = get_fasttext_vocab(config.filename_fasttext)

    # NOTE(review): with the OOV flag on, `vocab` is the very same set
    # object as `vocab_words`, so the UNK/NUM additions below also show up
    # in `oov_words`. Confirm whether that is intended.
    vocab = (vocab_words if config.use_fasttext_oov_vector_gen
             else vocab_words & vocab_fasttext)
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Dataset words that fastText has no vector for.
    oov_words = vocab_words - vocab_fasttext
    generate_fasttext_oov_vectors(oov_words, config.filename_oov_words,
                                  config.filename_oov_result_vectors)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the fastText vectors to the saved vocab (and insert new ones).
    word_to_idx, idx_to_word = load_vocab(config.filename_words)
    export_trimmed_fasttext_vectors(word_to_idx, idx_to_word,
                                    config.filename_fasttext,
                                    config.filename_fasttext_trimmed,
                                    config.dim_word,
                                    config.filename_oov_result_vectors,
                                    config.use_fasttext_oov_vector_gen)

    # The char vocab is built from the training file opened without a
    # word-processing function.
    raw_train = Dataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 25
0
def main():
    """Build the word/tag/char vocab files for the ontonotes-nw dataset.

    Points the config at the ``../datasets/ontonotes-nw`` train/dev/test
    files, collects the dataset vocabularies, keeps the words that also
    appear in the GloVe vocabulary plus ``UNK`` and ``NUM``, adds ``UNK`` to
    the tag vocabulary, writes the vocab files, exports the trimmed GloVe
    vectors and writes the character vocab of the training set.

    """
    config = Config(load=False)

    # Hard-coded dataset locations override whatever the config holds.
    config.filename_train = "../datasets/ontonotes-nw/train"
    config.filename_dev = "../datasets/ontonotes-nw/dev"
    config.filename_test = "../datasets/ontonotes-nw/test"

    to_word = get_processing_word(lowercase=True)

    # Dataset readers, created in dev / test / train order.
    dev = NERDataset(config.filename_dev, to_word)
    test = NERDataset(config.filename_test, to_word)
    train = NERDataset(config.filename_train, to_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Words known to both the datasets and GloVe, plus the special tokens.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)
    # The tag vocabulary also gets an UNK entry.
    vocab_tags.add(UNK)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Reload the saved vocab and export the GloVe vectors for it.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # The char vocab is built from the training file opened without a
    # word-processing function.
    raw_train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
Ejemplo n.º 26
0
def main():
    """Build the word/tag vocab files and the trimmed polyglot vectors.

    Collects the word and tag vocabularies of the dev, test and train
    datasets, keeps the words that also appear in the polyglot vocabulary
    plus ``UNK``, writes both vocabularies to disk, then reloads the word
    vocab and exports the trimmed polyglot vectors for it.

    """
    config = Config(load=False)
    to_word = get_processing_word()

    # Dataset readers, created in dev / test / train order.
    dev = getDataset(config.filename_dev, to_word)
    test = getDataset(config.filename_test, to_word)
    train = getDataset(config.filename_train, to_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_poly = get_polyglot_vocab(config.filename_polyglot)

    # Words shared by the datasets and polyglot, plus the unknown token.
    shared_vocab = vocab_words & vocab_poly
    shared_vocab.add(UNK)

    write_vocab(shared_vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the polyglot vectors to the vocab that was just saved.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_polyglot_vectors(saved_vocab, config.filename_polyglot,
                                    config.filename_trimmed, config.dim)
Ejemplo n.º 27
0
def main():
    """Build the word/tag vocab files and the trimmed GloVe vectors.

    Collects the word and tag vocabularies of the dev, test and train
    files, keeps the words that also appear in the GloVe vocabulary plus
    ``UNK`` and ``NUM``, writes both vocabularies to disk, then reloads the
    word vocab and exports the trimmed GloVe vectors for it.

    """
    config = Config(load=False)
    to_word = get_processing_word(lowercase=True)

    # Dataset readers, created in dev / test / train order.
    dev = GloveDataset(config.filename_dev, to_word)
    test = GloveDataset(config.filename_test, to_word)
    train = GloveDataset(config.filename_train, to_word)

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Only words present in both the dataset and GloVe are kept; the
    # unknown-word and number tokens are then added on top.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Reload the saved vocab and export the GloVe vectors for it.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Ejemplo n.º 28
0
def generate_model_data(data_prefix=None):
    """Build the vocab files and trimmed GloVe vectors for the model.

    Creates a ``Config`` with ``load=False``, optionally redirects its three
    dataset paths to ``<cwd>/data/<data_prefix>_<original basename>``, runs
    ``preprocess_data`` when the dev file does not exist, and then:

    * collects the word and tag vocabularies of train/dev/test,
    * keeps the words also found in the GloVe vocabulary plus ``UNK`` and
      ``NUM`` and writes the word and tag vocab files,
    * exports the trimmed GloVe vectors for the saved word vocab,
    * writes the character vocab of the training set.

    Args:
        data_prefix: optional string; when truthy it is prepended (followed
            by an underscore) to the basename of every dataset file, and
            the files are looked up in the ``data`` folder of the current
            working directory. When falsy the config paths are used as-is.

    """
    config = Config(load=False)
    print('Config')
    to_word = get_processing_word(lowercase=True)
    print('Processing_word')

    if data_prefix:
        data_dir = os.path.join(os.getcwd(), 'data')

        def _with_prefix(path):
            # Keep only the file name and relocate it under <cwd>/data.
            return os.path.join(data_dir,
                                data_prefix + '_' + os.path.basename(path))

        config.filename_dev = _with_prefix(config.filename_dev)
        config.filename_test = _with_prefix(config.filename_test)
        config.filename_train = _with_prefix(config.filename_train)

    # Only the dev file is probed to decide whether preprocessing is needed.
    dev_file_missing = not os.path.isfile(config.filename_dev)
    if dev_file_missing:
        print('Preprocessing tokens and labels to generate input data files')
        preprocess_data()

    # Dataset readers, created in dev / test / train order.
    dev = CoNLLDataset(config.filename_dev, to_word)
    test = CoNLLDataset(config.filename_test, to_word)
    train = CoNLLDataset(config.filename_train, to_word)
    print('Loaded dev, test, train')

    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    print('Loading vocab_words')
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Words known to both the datasets and GloVe, plus the special tokens.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Reload the saved vocab and export the GloVe vectors for it.
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # The char vocab is built from the training file opened without a
    # word-processing function.
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Extend a trained model's word vocabulary with new GloVe-covered words.

    Steps, in order:

    1. Read the existing word vocab file (one word per line).
    2. Collect the dev/test words that are not in it and that GloVe covers.
    3. Load the old model from ``./saves/less_words.h5`` and read the
       weights of its ``word_embeddings`` layer.
    4. Stack the GloVe vectors of the new words below the old embedding
       rows and save the result to ``config.filename_trimmed``.
    5. Append the new words to the word vocab file.
    6. Build a second model from a fresh ``Config`` (created after the
       files above were rewritten), copy the char-embedding, char-LSTM,
       bidirectional and CRF weights into it and save it to
       ``./saves/WEWWWWW.h5``.

    The enlarged embedding matrix is only written to disk here; it is not
    assigned to the new model's ``word_embeddings`` layer.

    """
    config = Config()

    # 1. Load the previous vocab words.
    with open(config.filename_words) as f:
        old_vocab = {line.strip() for line in f}
    print("Number of old vocabs = ", len(old_vocab))

    # 2. Words of the new datasets that the old vocab does not contain.
    processing_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)

    vocab_words, vocab_tags = get_vocabs([dev, test])

    vocab_new = vocab_words - old_vocab
    print("Number of new words: ", len(vocab_new))

    # Only new words that GloVe has a vector for can be added.
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = vocab_new & vocab_glove
    print("Final number of additions are: ", len(vocab))

    # Freeze one ordering so that embedding row i and appended vocab line i
    # are guaranteed to refer to the same word.
    new_words = list(vocab)

    # 3. Load the old model and grab its word-embedding matrix.
    model = BLSTMCRF(config)
    model.build()
    model.summary()
    model.load_weights('./saves/less_words.h5')
    embedding_weights = model.get_layer(name="word_embeddings").get_weights()[0]
    print(embedding_weights.shape)

    def create_embedding_dict(glove_path):
        """Return a dict mapping each word of the GloVe text file to its vector."""
        print("Creating embedding dictionary...")
        embeddings_index = {}
        # The context manager closes the file even if a line fails to parse.
        with open(glove_path, encoding='utf-8') as glove_file:
            for line in glove_file:
                values = line.split()
                embeddings_index[values[0]] = np.asarray(values[1:],
                                                         dtype='float32')

        print('Found %s word vectors.' % len(embeddings_index))
        return embeddings_index

    embeddings_index = create_embedding_dict(config.filename_glove)

    # 4. Old rows first, then one new row per added word.
    n_old = embedding_weights.shape[0]
    embeddings = np.zeros([n_old + len(new_words), embedding_weights.shape[1]])
    embeddings[:n_old] = embedding_weights
    for offset, word in enumerate(new_words):
        # NOTE(review): every word comes from `vocab_glove`, so a vector is
        # expected to exist; `.get` would return None if the two GloVe
        # readers ever disagreed -- confirm they tokenize lines identically.
        embeddings[n_old + offset] = embeddings_index.get(word)
    print("Size of new embeddings: ", embeddings.shape)
    np.savez_compressed(config.filename_trimmed, embeddings=embeddings)

    def append_vocab(words, filename):
        """Append ``words`` to ``filename``, one word per line.

        A newline is written first, so the first appended word starts on
        its own line; no newline follows the last word.

        Args:
            words: sequence of words to append
            filename: path to the vocab file

        """
        print("Writing vocab...")
        with open(filename, "a") as f:
            f.write("\n" + "\n".join(words))
        print("- done. {} tokens".format(len(words)))

    # 5. Extend the vocab file so that the next Config sees the new words.
    append_vocab(new_words, config.filename_words)

    # 6. Build the new model from a freshly created config.
    config2 = Config()
    model2 = BLSTMCRF(config2)
    model2.build()
    model2.summary()

    # (source layer in the old model, target layer in the new model).
    # The CRF layer carries a different name in each model.
    layer_pairs = [
        ("char_embeddings", "char_embeddings"),
        ("fw_char_lstm", "fw_char_lstm"),
        ("bw_char_lstm", "bw_char_lstm"),
        ("bidirectional", "bidirectional"),
        ("crf_1", "crf_2"),
    ]
    for source_name, target_name in layer_pairs:
        source_weights = model.get_layer(name=source_name).get_weights()
        model2.get_layer(name=target_name).set_weights(source_weights)

    # The word-embedding weights are intentionally left as built:
    # model2.get_layer(name="word_embeddings").set_weights([embeddings])
    model2.summary()
    model2.save_weights('./saves/WEWWWWW.h5')
Ejemplo n.º 30
0
        filename_tags,
        filename_chars,
        filename_word="../pretrained_vectors/vecs_{}.txt",
        filename_word_vec_trimmed="../pretrained_vectors/vecs_{}.trimmed.npz",
        which_tags=which_tags)

    vocab_words = load_vocab(filename_words)
    vocab_tags = load_vocab(filename_tags)
    vocab_chars = load_vocab(filename_chars)
    nwords = len(vocab_words)
    nchars = len(vocab_chars)
    ntags = len(vocab_tags)

    # load data
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=use_chars)
    processing_tag = get_processing_word(vocab_tags,
                                         lowercase=False,
                                         allow_unk=False)
    X_dev, y_dev = coNLLDataset_full(filename_dev, processing_word,
                                     processing_tag, max_iter, which_tags)
    X_train, y_train = coNLLDataset_full(filename_train, processing_word,
                                         processing_tag, max_iter, which_tags)
    X_valid, y_valid = coNLLDataset_full(filename_test, processing_word,
                                         processing_tag, max_iter, which_tags)

    print("Size of train, test and valid sets (in number of sentences): ")
    print(len(X_train), " ", len(y_train), " ", len(X_dev), " ", len(y_dev),
          " ", len(X_valid), " ", len(y_valid))
Ejemplo n.º 31
0
def main():
    """Build the word, tag and character vocab files from the command line.

    Usage::

        python <script> config [additional vocabulary in conll format ...]

    ``sys.argv[1]`` is the config file handed to ``Config``. Every further
    argument is read as an extra CoNLL file whose words and tags are merged
    into the vocabularies collected from the train/dev/test files.

    The word vocab (all collected words plus ``UNK`` and ``NUM``) and the
    tag vocab are written to disk. When ``config.use_pretrained`` is set the
    trimmed embedding vectors are exported for the saved word vocab.
    Finally the character vocab of the training set is written.

    Exits with status 1 after printing a usage message to stderr when no
    config file is given.

    """
    if len(sys.argv) < 2:
        sys.stderr.write("Too few arguments have been specified\n")
        sys.stderr.write("python "+sys.argv[0]+" config [additional vocabulary in conll format]\n")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # get config and processing of words
    config_file = sys.argv[1]

    config = Config(config_file, load=False)
    processing_word = get_processing_word(config)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # Merge the vocabulary of the additional files the model will later be
    # applied to (the slice is simply empty when none were given).
    for extra_file in sys.argv[2:]:
        wo, tg = get_vocabs([CoNLLDataset(extra_file, processing_word)])
        vocab_words |= wo
        vocab_tags |= tg

    # Every collected word is kept; there is no intersection with the
    # pre-trained embedding vocabulary.
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the pre-trained vectors to the vocab that was just saved
    vocab = load_vocab(config.filename_words)

    if config.use_pretrained:
        export_trimmed_embedding_vectors(vocab, config.filename_embeddings,
                                         config.filename_embeddings_trimmed,
                                         config.dim_word,
                                         config.embedding_type)

    # Build and save char vocab (training file opened without a
    # word-processing function)
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)