Example #1
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    #test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)

    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)

    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)

    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
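Every example on this page relies on a write_vocab / load_vocab pair that is not reproduced here. The sketch below shows the usual contract (one token per line, with the line index serving as the token id); it is an assumption, and the repositories behind the individual examples may differ in details (Example #9, for instance, passes the arguments to write_vocab in the opposite order).

def write_vocab(vocab, filename):
    # One token per line; the line index later becomes the token id.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(str(word) for word in vocab))


def load_vocab(filename):
    # Read the vocab file back into a dict mapping token -> line index.
    vocab = dict()
    with open(filename, encoding="utf-8") as f:
        for idx, word in enumerate(f):
            vocab[word.strip()] = idx
    return vocab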
Example #2
def build_joint_vocab(config):

    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005, processing_word,
                                             vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003, processing_word,
                                             vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor, processing_word,
                                          vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Example #3
def build_data(config):
    """
    Procedure to build data
    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
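For context, a minimal sketch of what export_trimmed_glove_vectors is expected to do given the call sites above: read the full GloVe text file, keep only the rows for words present in the vocab, and save them as a compressed .npz matrix indexed by the vocab ids. Treat this as an assumption about the helper, not the exact code of any of these repositories.

import numpy as np


def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    # vocab maps word -> id, so row vocab[word] holds that word's vector.
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(" ")
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(parts[1:], dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)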
Example #4
def get_conll2005_vocab(config, processing_word, embedding_vocab):
    print("*** CoNLL-2005 vocabulary ***")
    # Generators
    train = CoNLL2005Dataset(config.filename_train, processing_word)
    dev = CoNLL2005Dataset(config.filename_dev, processing_word)
    test_wsj = CoNLL2005Dataset(config.filename_test_wsj, processing_word)
    test_brown = CoNLL2005Dataset(config.filename_test_brown, processing_word)

    vocab_all_words, vocab_tags = get_vocabs(
        [train, dev, test_wsj, test_brown])
    vocab_words = vocab_all_words & embedding_vocab
    unk_words = vocab_all_words - embedding_vocab
    vocab_words = _add_vocab_constants(vocab_words)

    # Re-create Generators for char, with no processing
    train = CoNLL2005Dataset(config.filename_train)
    dev = CoNLL2005Dataset(config.filename_dev)
    test_wsj = CoNLL2005Dataset(config.filename_test_wsj)
    test_brown = CoNLL2005Dataset(config.filename_test_brown)

    vocab_chars = get_char_vocab([train, dev, test_wsj, test_brown])

    # Save vocab
    write_vocab(vocab_chars, config.filename_chars)
    write_vocab(vocab_words, config.filename_words)
    write_vocab(unk_words, "{}.unknown".format(config.filename_words))
    write_vocab(vocab_tags, config.filename_tags)

    return vocab_words, vocab_chars
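Examples #4, #11 and #12 call an _add_vocab_constants helper that is not shown on this page. A plausible minimal version simply inserts the special tokens used throughout these examples (UNK and NUM, assumed to be module-level constants); the exact token set is an assumption.

def _add_vocab_constants(vocab_words):
    # Make sure the special tokens the model expects are always present.
    vocab_words = set(vocab_words)
    vocab_words.add(UNK)
    vocab_words.add(NUM)
    return vocab_words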
Example #5
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
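Most examples build a character vocabulary with get_char_vocab. Below is a minimal sketch consistent with the call above, where a single dataset yields (words, tags) pairs; other examples pass a list of datasets or a plain set of words, so their versions of the helper presumably iterate differently. This is an assumption, not the original helper.

def get_char_vocab(dataset):
    # Collect every character that appears in any word of the dataset.
    vocab_chars = set()
    for words, _ in dataset:
        for word in words:
            vocab_chars.update(word)
    return vocab_chars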
Example #6
def test_dataset():
    # test getDataset and get_vocabs
    processing_word = get_processing_word()
    dev = getDataset("../data/test_ner.txt", processing_word)
    vocab_words, vocab_tags = get_vocabs([dev])

    # get common vocab from dev file and polyglot
    vocab_poly = get_polyglot_vocab("../data/polyglot-zh.pkl")
    vocab = vocab_words & vocab_poly
    vocab.add(UNK)

    write_vocab(vocab, "../data/words.txt")
    write_vocab(vocab_tags, "../data/tags.txt")

    vocab = load_vocab("../data/words.txt")
    export_trimmed_polyglot_vectors(vocab, "../polyglot-zh.pkl", "../data/polyglot.trimmed.npz", 64)
    data = get_trimmed_polyglot_vectors("../data/polyglot.trimmed.npz")
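get_processing_word is used in nearly every example but never shown. A common pattern is to return a closure that normalizes a raw token before it is counted or looked up; the sketch below assumes lowercasing plus mapping digit strings to the NUM constant, which matches how the resulting vocabularies are used above.

def get_processing_word(lowercase=False):
    # Build a callable that normalizes one raw token at a time.
    def f(word):
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        return word
    return f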
Example #7
def build_data(config, logger):
    """
    Procedure to build data
    """

    # Generators
    processing_word = get_processing_word(lowercase=config.lowercase)
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    logger.info("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs(
        [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    vocab_tags = [tags for tags in vocab_tags]
    vocab_tags.remove("O")
    vocab_tags.insert(0, "O")
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    logger.info("Build chars vocab...")
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save Depstree
    processing_relation = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, processing_word,
                           processing_relation)
    train_deps = DepsDataset(config.train_deps_filename, processing_word,
                             processing_relation)

    logger.info("Build relations vocab...")
    vocab_relations = get_relations_vocabs([train_deps, dev_deps])
    vocab_relations.add(UNK)
    write_vocab(vocab_relations, config.relations_filename)
Example #8
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)


    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
Example #9
def main():
    # get config
    config = Config(load=False)

    # Generators
    train = get_datasets(config.filename_train)
    valid = get_datasets(config.filename_valid)
    test = get_datasets(config.filename_test)

    # add <start> to glove
    # add_glove(config.filename_glove, config.dim_word)

    # Build word vocab
    train_words = get_train_vocab(train)
    glove_vocab = get_glove_vocab(config.filename_glove)

    # train & glove(word to index)
    vocab = word2index(train_words, glove_vocab)
    # save vocab
    write_vocab(config.filename_words, vocab)

    # index to word
    index = index2word(vocab)
    write_vocab(config.filename_index, index)

    # embedding
    glove_embedding(config.filename_glove, config.filename_trimmed_glove,
                    config.dim_word, vocab, config.start, config.pad)

    # trim datasets
    get_trimmed_datasets(config.filename_trimmed_train, train, vocab,
                         config.max_length)
    get_trimmed_datasets(config.filename_trimmed_valid, valid, vocab,
                         config.max_length)
    get_trimmed_datasets(config.filename_trimmed_test, test, vocab,
                         config.max_length)
Example #10
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #11
def get_semcor_vocab(config, processing_word, embedding_vocab):
    print("*** Semcor vocabulary ***")
    # Iterator
    raw = SemcorDataset(config.filename_raw, processing_word)

    # Word Vocab
    vocab_all_words, vocab_tags = get_vocabs([raw])
    vocab_words = vocab_all_words & embedding_vocab
    unk_words = vocab_all_words - embedding_vocab
    vocab_words = _add_vocab_constants(vocab_words)

    # Char Vocab
    raw = SemcorDataset(config.filename_raw)
    vocab_chars = get_char_vocab([raw])

    # Save vocab
    write_vocab(vocab_chars, config.filename_chars)
    write_vocab(vocab_words, config.filename_words)
    write_vocab(unk_words, "{}.unknown".format(config.filename_words))
    write_vocab(vocab_tags, config.filename_tags)

    return vocab_words, vocab_chars
Example #12
def get_conll2003_vocab(config, processing_word, embedding_vocab):
    print("*** CoNLL-2003 vocabulary ***")
    # Iterators
    dev = CoNLL2003Dataset(config.filename_dev)
    test = CoNLL2003Dataset(config.filename_test)
    train = CoNLL2003Dataset(config.filename_train)

    vocab_all_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_words = vocab_all_words & embedding_vocab
    unk_words = vocab_all_words - embedding_vocab
    vocab_words = _add_vocab_constants(vocab_words)

    dev = CoNLL2003Dataset(config.filename_dev)
    test = CoNLL2003Dataset(config.filename_test)
    train = CoNLL2003Dataset(config.filename_train)
    vocab_chars = get_char_vocab([train, dev, test])

    # Save vocab
    write_vocab(vocab_chars, config.filename_chars)
    write_vocab(vocab_words, config.filename_words)
    write_vocab(unk_words, "{}.unknown".format(config.filename_words))
    write_vocab(vocab_tags, config.filename_tags)

    return vocab_words, vocab_chars
Example #13
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #14
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)

    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore, check and remove, if this is the case
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))

    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file, pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
Example #15
def build_data(config):
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)

    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue

        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue

            wimp_trans_files = [
                os.path.join(folder, meta_filename % (file_id, 'A')),
                os.path.join(folder, meta_filename % (file_id, 'B'))
            ]

            swd_trans_files = [
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'A')),
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'B'))
            ]

            for i, wimp_trans_file in enumerate(wimp_trans_files):
                swd_trans_file = swd_trans_files[i]
                file_id, speaker = swd_trans_file.split("/")[-2:]
                speaker = speaker[6]
                with open(wimp_trans_file) as w_file_obj, open(
                        swd_trans_file) as s_file_obj:
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        sentence = []
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert len(anns) == len(wrds), \
                            "file mismatch, line %d: %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)

                        for id_, wrd in enumerate(wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append([(file_id, line_num, speaker),
                                                 wrd,
                                                 float(anns[id_])])

                        if len(sentence) != 0:
                            annotations.append(sentence)

    random.shuffle(annotations)

    # 80% for training, 10% dev, 10% test
    n = len(annotations)
    d_train = annotations[:int(0.8 * n)]
    d_test = annotations[int(0.8 * n):int(0.9 * n)]
    d_dev = annotations[int(0.9 * n):]

    def prep_text_data(D, outfile):
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    # Vocabulary is built using training data
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #16
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
    vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    vocab_chunks = [tags for tags in vocab_chunks]
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks used as mpqa has something wrong!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    vocab_aspect_tags = [tags for tags in vocab_aspect_tags]
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = [tags for tags in vocab_polarity_tags]
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = [tags for tags in vocab_joint_tags]
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
Example #17
vocab_words = set()
vocab_tags = set()
vocab_chars = set()
file = open('data/all.txt')
for line in file:
	line = line.strip()
	if len(line) == 0:
		continue
	token, tag = line.split(' ')
	print(token, tag)
	for c in token:
		vocab_chars.add(c)
	vocab_words.add(token)
	vocab_tags.add(tag)

# Build Word and Tag vocab
vocab_glove = get_glove_vocab(config.glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocabs
write_vocab(vocab, config.words_filename)
write_vocab(vocab_tags, config.tags_filename)
write_vocab(vocab_chars, config.chars_filename)

# Trim GloVe Vectors
vocab = load_vocab(config.words_filename)
export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim)
Example #18
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev,
                                                     test])  #pos adding-----
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)  #add dic vector get
    vocab_syl = get_dic_vocab(config.syl_filename, 1)  #add syl vector
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  #morph vector get

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic  #add dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl  #add syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph  # add morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)  #add dic
    write_vocab(word_syl, config.word_syl_filename)  #add syl
    write_vocab(word_morph, config.morphs_filename)  #add morph
    write_vocab(vocab_pos, config.posTag_filename)  #add pos

    # Trim GloVe Vectors(pretrain vector)
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
    word_dic = load_vocab(config.word_dic_filename)  #dic add
    export_dic_vectors(word_dic, config.dic_filename, config.exported_filename,
                       config.dic_dim)
    word_syl = load_vocab(config.word_syl_filename)  #syl add
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)
    word_morph = load_vocab(config.morphs_filename)  #morph add
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)
    vocab_pos = load_vocab(config.posTag_filename)  #pos add
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab, morph vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #19
config = Config()

processing_word = get_processing_word(lowercase=True)

# Generators
dev   = CoNLLDataset(config.filename_dev, processing_word)
test  = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)
vocab = vocab_words & vocab_glove
vocab.add(config.UNK)
vocab.add(config.NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)

# Trim GloVe Vectors
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(vocab, config.filename_glove,
                            config.filename_trimmed, config.dim_word)

# Build and save char vocab
train = CoNLLDataset(config.filename_train)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, config.filename_chars)

# === Build model ===
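# NOTE: embeddings, lr, lr_decay, batch_size, nepochs, result_dir and the
# word_ids/char_ids/labels arrays are assumed to be defined earlier in the
# original script; they are not part of this excerpt.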

model = _build_model(embeddings)
# Optimizer: Adam shows best results
adam_op = Adam(lr=lr, decay=lr_decay)
model.compile(optimizer=adam_op, loss='categorical_crossentropy', metrics=['accuracy'])

# train model
print('Beginning model fitting...')
model.fit([word_ids_arr, char_ids_arr], labels_arr_one_hot, batch_size=batch_size, epochs=nepochs,
          validation_data=([word_ids_arr_valid, char_ids_arr_valid], labels_arr_one_hot_valid))

# Export keras model to TF SavedModel format
print('Exporting SavedModel to {}'.format(result_dir))
model.trainable = False
with tf.keras.backend.get_session() as sess:
    tf.saved_model.simple_save(
        sess,
        result_dir,
        inputs={t.name: t for t in model.inputs},
        outputs={t.name: t for t in model.outputs})

# export vocabs
print('Writing vocab files to {}'.format(result_dir))
write_vocab(vocab_words, '{}/words.txt'.format(result_dir))
write_vocab(vocab_chars, '{}/chars.txt'.format(result_dir))
write_vocab(vocab_tags, '{}/tags.txt'.format(result_dir))

print('Completed training!')
Example #20
words_filename = "{}/words.txt".format(output_dir)
tags_filename = "{}/tags.txt".format(output_dir)
chars_filename = "{}/chars.txt".format(output_dir)

processing_word = get_processing_word(lowercase=True)

train = CoNLLDataset(train_filename, processing_word)
valid = CoNLLDataset(valid_filename, processing_word)

# Build word and tag vocabs
vocab_words, vocab_tags = get_vocabs([train, valid])
vocab_glove = get_glove_vocab(glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, words_filename)
write_vocab(vocab_tags, tags_filename)

# Trim GloVe Vectors
vocab = load_vocab(words_filename)
export_trimmed_glove_vectors(vocab, glove_filename, filename_trimmed, dim_word)

# Build and save char vocab
train = CoNLLDataset(train_filename)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, chars_filename)
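Finally, the trimmed .npz produced by the export helpers is loaded back at training time in these repositories. A minimal loading counterpart, again an assumption that matches the compressed-.npz format assumed in the sketch after Example #3:

import numpy as np


def get_trimmed_glove_vectors(filename):
    # Load the embedding matrix saved by export_trimmed_glove_vectors.
    with np.load(filename) as data:
        return data["embeddings"]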