def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
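# Every script in this file funnels tokens through get_processing_word.
# Below is a minimal sketch of that helper (assumed behavior, modeled on the
# common sequence-tagging utilities; the UNK/NUM placeholder strings are
# assumptions, not taken from the original code):
UNK = "$UNK$"
NUM = "$NUM$"

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    """Return a closure mapping a raw token to ids (optionally with char ids)."""
    def f(word):
        # 0. get char ids of the word, if a char vocab is used
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        # 1. normalize the word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        # 2. look up the word id
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise KeyError("unknown token not allowed: check the vocab")
        # 3. return (char ids, word id) or just the word id
        if vocab_chars is not None and chars:
            return char_ids, word
        return word
    return f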
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    # test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)
    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)
    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)
    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename,
                               config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_pos, config.max_iter)

    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    # `state` and `file` are assumed to be defined elsewhere
    # (e.g. at module level from command-line arguments)
    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == "predict"
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
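# Hypothetical wiring for the `state` and `file` globals read by main() above;
# this block is an assumption for illustration (the argument order, Config
# import and defaults are not from the original script):
if __name__ == "__main__":
    import sys
    from config import Config  # assumed location of the Config class
    state = sys.argv[1] if len(sys.argv) > 1 else "train"  # train | evaluate | predict
    file = sys.argv[2] if len(sys.argv) > 2 else None      # raw input for predict mode
    main(Config())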
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
        vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = \
        get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    # make sure "NO" is the first chunk tag, so it gets id 0
    vocab_chunks = list(vocab_chunks)
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks (used as mpqa) is missing the 'NO' tag!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    # make sure "O" is the first tag in each tag vocab, so it gets id 0
    vocab_aspect_tags = list(vocab_aspect_tags)
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = list(vocab_polarity_tags)
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = list(vocab_joint_tags)
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    # Trim domain and general embeddings
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
def build_data(config, logger):
    """
    Procedure to build data
    """
    # Generators
    processing_word = get_processing_word(lowercase=config.lowercase)
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    logger.info("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs(
        [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    vocab_tags = list(vocab_tags)
    vocab_tags.remove("O")
    vocab_tags.insert(0, "O")
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    logger.info("Build chars vocab...")
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save Depstree
    processing_relation = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, processing_word,
                           processing_relation)
    train_deps = DepsDataset(config.train_deps_filename, processing_word,
                             processing_relation)

    logger.info("Build relations vocab...")
    vocab_relations = get_relations_vocabs([train_deps, dev_deps])
    vocab_relations.add(UNK)
    write_vocab(vocab_relations, config.relations_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # train, then time the evaluation
    print(vocab_tags)
    model.train(train, dev, vocab_tags)
    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
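# The hard-coded vocab_iob / vocab_type above assume CoNLL-2003 style tags
# such as "B-PER" or "I-LOC", which factor into an IOB prefix and an entity
# type. A small illustration (split_tag is a hypothetical helper, not part
# of the original code):
def split_tag(tag):
    # "B-PER" -> ("B", "PER"); the outside tag "O" carries no entity type
    if tag == "O":
        return "O", None
    iob, _, ent_type = tag.partition("-")
    return iob, ent_type

assert split_tag("B-PER") == ("B", "PER")
assert split_tag("I-MISC") == ("I", "MISC")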
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab (strip the "B-"/"I-" prefix from each tag)
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabs
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          vocab_pref_suff, vocab_pref_suff_2,
                                          vocab_pref_suff_4, lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         Geoparser=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets: dev, test and train hold the raw words and tags;
    # processing_word / processing_tag map them to word and tag indices,
    # so model.evaluate below calls run_evaluate through run_epoch
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train and evaluate
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def train(config_path, continue_=False):
    # read hyperparameters
    config_params = json.load(open(config_path))

    # load hyperparams and build vocab.txt, chars.txt, tags.txt and embeddings
    config_data = config(**config_params, load=False)
    data_builder(config_data)

    # reload the config with the data created above
    config_train = config(**config_params, load=True)

    # build model
    model = BILSTM_CRF(config_train)
    model.build()

    if continue_:
        try:
            model_path = config_params["model_path"]
            print("Loading weights from path:", model_path)
            model.restore_session(model_path)
            model.reinitialize_weights("proj")
            print("Restoring weights")
        except Exception as e:
            print("Restoring weights failed, training from scratch")
            print(e)
            input()  # pause so the user can read the error

    # data generators
    train_set = CoNLLDataset(config_train.train, config_train.process_words,
                             config_train.process_tags)
    dev = CoNLLDataset(config_train.test, config_train.process_words,
                       config_train.process_tags)

    # train model
    model.train(train_set, dev)
    print("Training complete!")
    print("Remove the events.tf files from the output directory if you don't "
          "need them. Note that removing them won't affect the predictions "
          "in any way.")
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    # build model
    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=ntype)
    model.build()

    model.train(train, dev, vocab_type)
    model.evaluate(test, vocab_type)
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)
    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore; check and remove if so
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))

    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file, pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags and
    characters. Having built the vocabularies it writes them to files; writing
    a vocabulary to a file assigns an id (the line number) to each word. It
    then extracts the relevant GloVe vectors and stores them in a numpy array
    such that the i-th entry corresponds to the i-th word in the vocabulary.

    Args:
        config: (instance of Config) has attributes like hyper-params...
    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build word and tag vocabs (assumed continuation, mirroring the
    # build_data variants above; the config.filename_* attribute names
    # are assumptions)
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
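# A minimal sketch of the vocabulary/embedding helpers the docstring above
# describes (assumed implementations, consistent with how they are called in
# most of these scripts; some variants return an (word2idx, idx2word) pair
# instead): write_vocab makes the line number the word id, load_vocab inverts
# that, and export_trimmed_glove_vectors stores a matrix whose i-th row is
# the GloVe vector of the word with id i.
import numpy as np

def write_vocab(vocab, filename):
    # one word per line; the 0-based line number becomes the word id
    with open(filename, "w") as f:
        for word in vocab:
            f.write("{}\n".format(word))

def load_vocab(filename):
    # map each word to its line number, mirroring write_vocab
    d = dict()
    with open(filename) as f:
        for idx, word in enumerate(f):
            d[word.strip()] = idx
    return d

def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    # keep only the vectors of in-vocabulary words; row i = word with id i
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(parts[1:],
                                                     dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)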
from model import NERModel
from config import Config
# load_vocab, get_processing_word, get_trimmed_glove_vectors and CoNLLDataset
# are assumed to live in the project's data_utils module
from data_utils import (CoNLLDataset, load_vocab, get_processing_word,
                        get_trimmed_glove_vectors)

config = Config()

vocab_words = load_vocab(config.words_filename)  # word idx
vocab_tags = load_vocab(config.tags_filename)    # tag idx
vocab_chars = load_vocab(config.chars_filename)  # char idx

processing_word = get_processing_word(vocab_words, vocab_chars)
processing_tag = get_processing_word(vocab_tags)

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

dev = CoNLLDataset(filename=config.dev_filename,
                   processing_word=processing_word,
                   processing_tag=processing_tag,
                   max_iter=config.max_iter)
test = CoNLLDataset(filename=config.test_filename,
                    processing_word=processing_word,
                    processing_tag=processing_tag,
                    max_iter=config.max_iter)
train = CoNLLDataset(filename=config.train_filename,
                     processing_word=processing_word,
                     processing_tag=processing_tag,
                     max_iter=config.max_iter)

model = NERModel(config=config, embeddings=embeddings,
                 ntags=len(vocab_tags), nchars=len(vocab_chars))
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset

    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'

    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False,
                  w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i,
                            allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)

    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)

    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(
            word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(
            char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(
            labels, pad_tok=0, wthres=1024, cthres=32)

        w_tensor = Data2tensor.idx2tensor(word_ids)
        c_tensor = Data2tensor.idx2tensor(char_ids)
        y_tensor = Data2tensor.idx2tensor(label_ids)

        data_tensor = Data2tensor.sort_tensors(label_ids, word_ids,
                                               sequence_lengths, char_ids,
                                               word_lengths,
                                               volatile_flag=False)
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word, Tag and POS vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)         # dictionary vectors
    vocab_syl = get_dic_vocab(config.syl_filename, 1)         # syllable vectors
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  # morpheme vectors

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)
    write_vocab(word_syl, config.word_syl_filename)
    write_vocab(word_morph, config.morphs_filename)
    write_vocab(vocab_pos, config.posTag_filename)

    # Trim pretrained vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    word_dic = load_vocab(config.word_dic_filename)
    export_dic_vectors(word_dic, config.dic_filename,
                       config.exported_filename, config.dic_dim)

    word_syl = load_vocab(config.word_syl_filename)
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)

    word_morph = load_vocab(config.morphs_filename)
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)

    vocab_pos = load_vocab(config.posTag_filename)
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
vocab_relations = load_vocab(config.relations_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=config.lowercase,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_relation = get_processing_relation(vocab_relations)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag=processing_tag)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag=processing_tag)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag=processing_tag)

# iterate once over every sentence so each dataset records its max lengths
# (list() forces the map, which is lazy in Python 3)
data = [dev, test, train]
_ = list(map(len, chain.from_iterable(data)))

max_sentence_size = max(train.max_words_len, test.max_words_len,
                        dev.max_words_len)
max_word_size = max(train.max_chars_len, test.max_chars_len,
                    dev.max_chars_len)
processing_aspect_tag = get_processing_word(vocab_aspect_tags,
                                            lowercase=False)
processing_polarity_tag = get_processing_word(vocab_polarity_tags,
                                              lowercase=False)
processing_joint_tag = get_processing_word(vocab_joint_tags, lowercase=False)

# get pre-trained embeddings
domain_embeddings = get_trimmed_glove_vectors(config.domain_trimmed_filename)
general_embeddings = get_trimmed_glove_vectors(config.general_trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word, processing_pos,
                   processing_chunk, processing_aspect_tag,
                   processing_polarity_tag, processing_joint_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_pos,
                    processing_chunk, processing_aspect_tag,
                    processing_polarity_tag, processing_joint_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_pos,
                     processing_chunk, processing_aspect_tag,
                     processing_polarity_tag, processing_joint_tag,
                     config.max_iter)

# iterate once over every sentence so each dataset records its max length
# (list() forces the map, which is lazy in Python 3)
data = [dev, test, train]
_no_use_ = list(map(len, chain.from_iterable(data)))

max_sentence_size = max(train.max_sentence_len, test.max_sentence_len,
                        dev.max_sentence_len)
# training data
train_filename = "{}/train.txt".format(data_dir)
valid_filename = "{}/valid.txt".format(data_dir)

# glove files
glove_filename = "{}/glove.6B.{}d.txt".format(glove_dir, dim_word)
# trimmed embeddings (created from glove_filename with build_data.py)
filename_trimmed = "{}/glove.6B.{}d.trimmed.npz".format(output_dir, dim_word)

words_filename = "{}/words.txt".format(output_dir)
tags_filename = "{}/tags.txt".format(output_dir)
chars_filename = "{}/chars.txt".format(output_dir)

processing_word = get_processing_word(lowercase=True)

train = CoNLLDataset(train_filename, processing_word)
valid = CoNLLDataset(valid_filename, processing_word)

# Build word and tag vocabs
vocab_words, vocab_tags = get_vocabs([train, valid])
vocab_glove = get_glove_vocab(glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, words_filename)
write_vocab(vocab_tags, tags_filename)

# Trim GloVe Vectors (assumed continuation, mirroring the build_data
# variants above)
vocab = load_vocab(words_filename)
export_trimmed_glove_vectors(vocab, glove_filename, filename_trimmed,
                             dim_word)
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags, posflag=True,
                                     lowercase=True, pos_lm=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
syl_embeddings = get_exported_dic_vectors(config.exported_sfilename)
pos_embeddings = get_exported_pos_vectors(config.exported_pfilename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
lmposs = len(pos_tags)
model = NERModel(config, embeddings, dic_embeddings, pos_embeddings,
                 syl_embeddings, morph_embeddings,
use_chars = True
max_iter = None

print('Loading vocab files and word vectors from {}'.format(data_dir))
vocab_tags = load_vocab("{}/assets/tags.txt".format(data_dir))
vocab_chars = load_vocab("{}/assets/chars.txt".format(data_dir))
vocab_words = load_vocab("{}/assets/words.txt".format(data_dir))

n_words = len(vocab_words)
n_char = len(vocab_chars)
n_tags = len(vocab_tags)
pad_tag = n_tags
n_labels = n_tags + 1

# CoNLL data for training
train = CoNLLDataset(train_filename,
                     get_processing_word(vocab_words, vocab_chars,
                                         lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False),
                     max_iter)
# CoNLL data for validation
valid = CoNLLDataset(valid_filename,
                     get_processing_word(vocab_words, vocab_chars,
                                         lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False),
                     max_iter)

emb_data = np.load("{}/assets/glove.6B.300d.trimmed.npz".format(data_dir))
embeddings = emb_data["embeddings"]

# Hyperparameters
dim_word = 300
dim_char = 100
hidden_size_char = 100   # lstm on chars
hidden_size_lstm = 300   # lstm on word embeddings
nepochs = args.epochs
lr = 0.0105
lr_decay = 0.0005