Example #1
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False, allow_unk=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, config.max_iter)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
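Most of the examples below build their input pipelines around get_processing_word. As a rough, minimal sketch (not the project's actual implementation) of what such a processing function does, judging only from the keyword arguments used in these snippets (lowercase, chars, allow_unk) and from the UNK/NUM tokens the build_data examples add to the vocabularies:

def make_processing_word(vocab_words=None, vocab_chars=None,
                         lowercase=False, chars=False, allow_unk=True):
    """Return a callable mapping a raw token to ids (sketch, names assumed)."""
    def f(word):
        # optional char-level ids from the original (un-lowercased) word
        char_ids = None
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = "$NUM$"   # placeholder token; the actual literal is an assumption
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words["$UNK$"]   # placeholder token; assumption
            else:
                raise KeyError("unknown word with allow_unk=False: " + word)
        return word if char_ids is None else (char_ids, word)
    return f

# hypothetical usage: f = make_processing_word(vocab_words, vocab_chars,
#                                              lowercase=True, chars=True)
# f("Paris") -> (list of char ids, word id)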
Example #2
def build_data(config):
    """
    Procedure to build data
    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
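Several of the build_data variants end by exporting a "trimmed" embedding file, and Example #21 later reads it back with np.load(...)["embeddings"]. A minimal sketch of what such an export/load pair could look like, assuming the standard GloVe text format ("word v1 ... vd" per line) and the npz key used in Example #21; this is an illustration, not the project's code:

import numpy as np

def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    """Keep only the GloVe rows for words in `vocab`; row vocab[word] = vector."""
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(" ")           # "word v1 v2 ... vd" (assumed format)
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)

def get_trimmed_glove_vectors(trimmed_filename):
    """Load the matrix back; row i corresponds to the word with id i."""
    return np.load(trimmed_filename)["embeddings"]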
Example #3
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
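get_vocabs and get_char_vocab simply accumulate sets while iterating the datasets. A rough sketch, assuming each dataset yields (words, tags) sentence pairs as in Example #16's loop; the variants in these examples differ in how many vocabularies they return (some also collect POS or chunk tags), so this only shows the general idea, not any repository's exact code:

def get_vocabs(datasets):
    """Collect the sets of words and tags seen in the given datasets."""
    vocab_words, vocab_tags = set(), set()
    for dataset in datasets:
        for words, tags in dataset:
            vocab_words.update(words)
            vocab_tags.update(tags)
    return vocab_words, vocab_tags

def get_char_vocab(dataset):
    """Collect the set of characters appearing in the dataset's words."""
    vocab_chars = set()
    for words, _ in dataset:
        for word in words:
            vocab_chars.update(word)
    return vocab_chars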
Example #4
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    #test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)

    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)

    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)

    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #5
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _  = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)


    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)

    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)

    processing_pos = get_processing_word(vocab_pos,
                                         lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)


    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)

    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)
    
    # build model
    model = NERModel(config, embeddings, embeddings_uni,
                     pos_embeddings, ntags=len(vocab_tags), nchars=len(vocab_chars), vocab_words=idx2words,
                    NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    # `state` ("train" / "evaluate" / "predict") and `file` used below are
    # presumably defined at module level (e.g. parsed from the command line)
    if state == "train":
        model.train(train, dev, vocab_tags)

    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)

    else: #state == predict
        convert(file)
        t2o("data_format/test_convert.txt","data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)

        model.evaluate(test, vocab_tags)

        tagging("data_format/test_convert.txt")
Example #6
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
    vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    vocab_chunks = [tags for tags in vocab_chunks]
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks used as mpqa has something wrong!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    vocab_aspect_tags = [tags for tags in vocab_aspect_tags]
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = [tags for tags in vocab_polarity_tags]
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = [tags for tags in vocab_joint_tags]
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
Example #7
def build_data(config, logger):
    """
    Procedure to build data
    """

    # Generators
    processing_word = get_processing_word(lowercase=config.lowercase)
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    logger.info("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs(
        [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    vocab_tags = [tags for tags in vocab_tags]
    vocab_tags.remove("O")
    vocab_tags.insert(0, "O")
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    logger.info("Build chars vocab...")
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save Depstree
    processing_relation = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, processing_word,
                           processing_relation)
    train_deps = DepsDataset(config.train_deps_filename, processing_word,
                             processing_relation)

    logger.info("Build relations vocab...")
    vocab_relations = get_relations_vocabs([train_deps, dev_deps])
    vocab_relations.add(UNK)
    write_vocab(vocab_relations, config.relations_filename)
Example #8
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=3,
                     ntype=4)

    model.build()

    # train, evaluate and interact
    print(vocab_tags)
    model.train(train, dev, vocab_tags)

    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
Example #9
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)


    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
Example #10
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabs
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)
    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          vocab_pref_suff,
                                          vocab_pref_suff_2,
                                          vocab_pref_suff_4,
                                          lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags,
                                         lowercase=False,
                                         Geoparser=True)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    # dev, test and train hold the raw words and tags; processing_word and
    # processing_tag map them to the corresponding word and tag indices, so
    # model.evaluate below ends up calling run_evaluate on already-indexed data.
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
Example #11
def train(config_path, continue_=False):
    # read hyperparameters
    config_params = json.load(open(config_path))

    # build the data: vocab.txt, chars.txt, tags.txt and trimmed embeddings
    config_data = config(**config_params, load=False)
    data_builder(config_data)

    # create a config that loads the data built above
    config_train = config(**config_params, load=True)
    
    #build model
    model = BILSTM_CRF(config_train)
    model.build()
    
    if continue_:
        try:
            model_path = config_params["model_path"]
            print("Loading weights from path: ", model_path)
            model.restore_session(model_path)
            model.reinitialize_weights("proj")
            print("Restored weights")
        except Exception as e:
            print("Restoring weights failed")
            print("Training from scratch")
            print(e)
            input()

    # data generators
    dev   = CoNLLDataset(config_train.train, config_train.process_words,
                         config_train.process_tags)
    train = CoNLLDataset(config_train.test, config_train.process_words,
                         config_train.process_tags)
    
    # train model
    model.train(train, dev)
    
    print("Trainig Complete!")
    print("Remove the events.tf files from the output directory if you don't need them. Note that removing them won't affect the predictions in anyway")
Example #12
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O":0, "B":1, "I":2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)
    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)
    processing_iob = get_processing_word(vocab_iob, 
                    lowercase=False)
    processing_type = get_processing_word(vocab_type, 
                    lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars),
                                         niob=3,
                                         ntype=ntype)

    model.build()

    model.train(train, dev, vocab_type)

    model.evaluate(test, vocab_type)
Example #13
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)

    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore, check and remove, if this is the case
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))

    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file, pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
Example #14
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies, it writes them to a file; the
    line number of a word in that file becomes its id.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
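The snippet above is cut off after the dataset generators. Judging from the docstring and from the other build_data examples, the procedure would presumably continue roughly as follows; the config attribute names (filename_glove, filename_words, filename_tags, filename_trimmed, filename_chars, dim_word) follow this example's naming pattern and are assumptions:

    # Build word and tag vocabs, keep only the words that also occur in GloVe,
    # and add the special tokens (compare Examples #2 and #3)
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)      # attribute name assumed
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab files; the line number of a word in the file becomes its id
    write_vocab(vocab, config.filename_words)                 # attribute name assumed
    write_vocab(vocab_tags, config.filename_tags)             # attribute name assumed

    # Trim the GloVe vectors so that row i matches word id i, then the chars
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)           # attribute name assumed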
Example #15
from model import NERModel
from config import Config

config = Config()

vocab_words = load_vocab(config.words_filename)  # words idx
vocab_tags = load_vocab(config.tags_filename)  # tags idx
vocab_chars = load_vocab(config.chars_filename)  # char idx

processing_word = get_processing_word(vocab_words, vocab_chars)
processing_tag = get_processing_word(vocab_tags)

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

dev = CoNLLDataset(filename=config.dev_filename,
                   processing_word=processing_word,
                   processing_tag=processing_tag,
                   max_iter=config.max_iter)
test = CoNLLDataset(filename=config.test_filename,
                    processing_word=processing_word,
                    processing_tag=processing_tag,
                    max_iter=config.max_iter)
train = CoNLLDataset(filename=config.train_filename,
                     processing_word=processing_word,
                     processing_tag=processing_tag,
                     max_iter=config.max_iter)

model = NERModel(config=config,
                 embeddings=embeddings,
                 ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
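Example #15 stops after constructing the model. Following the pattern of Example #1, it would presumably continue with something like this (not part of the original snippet):

model.build()
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)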
Example #16
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
    
if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file='/media/data/NER/conll03/conll03/train.bmes'
    dev_file='/media/data/NER/conll03/conll03/dev.bmes'
    test_file='/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    
    
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data=[]
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
    
    # only the last minibatch's padded ids survive the loop above; convert
    # those to tensors
    w_tensor = Data2tensor.idx2tensor(word_ids)
    c_tensor = Data2tensor.idx2tensor(char_ids)
    y_tensor = Data2tensor.idx2tensor(label_ids)
    
    data_tensor = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, char_ids, word_lengths, volatile_flag=False)
Example #17
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev,
                                                     test])  #pos adding-----
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)  #add dic vector get
    vocab_syl = get_dic_vocab(config.syl_filename, 1)  #add syl vector
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  #morph vector get

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic  #add dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl  #add syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph  # add morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)  #add dic
    write_vocab(word_syl, config.word_syl_filename)  #add syl
    write_vocab(word_morph, config.morphs_filename)  #add morph
    write_vocab(vocab_pos, config.posTag_filename)  #add pos

    # Trim GloVe Vectors(pretrain vector)
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
    word_dic = load_vocab(config.word_dic_filename)  #dic add
    export_dic_vectors(word_dic, config.dic_filename, config.exported_filename,
                       config.dic_dim)
    word_syl = load_vocab(config.word_syl_filename)  #syl add
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)
    word_morph = load_vocab(config.morphs_filename)  #morph add
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)
    vocab_pos = load_vocab(config.posTag_filename)  #pos add
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab, morph vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #18
    vocab_relations = load_vocab(config.relations_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=config.lowercase,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_relation = get_processing_relation(vocab_relations)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename,
                       processing_word,
                       processing_tag=processing_tag)
    test = CoNLLDataset(config.test_filename,
                        processing_word,
                        processing_tag=processing_tag)
    train = CoNLLDataset(config.train_filename,
                         processing_word,
                         processing_tag=processing_tag)

    data = [dev, test, train]
    # force one full pass over every dataset (map is eager under Python 2),
    # presumably so the max_words_len / max_chars_len attributes used below
    # get populated
    _ = map(len, chain.from_iterable(w for w in (s for s in data)))
    max_sentence_size = max(train.max_words_len, test.max_words_len,
                            dev.max_words_len)
    max_word_size = max(train.max_chars_len, test.max_chars_len,
                        dev.max_chars_len)
Example #19
    processing_aspect_tag = get_processing_word(vocab_aspect_tags,
                                                lowercase=False)
    processing_polarity_tag = get_processing_word(vocab_polarity_tags,
                                                  lowercase=False)
    processing_joint_tag = get_processing_word(vocab_joint_tags,
                                               lowercase=False)

    # get pre trained embeddings
    domain_embeddings = get_trimmed_glove_vectors(
        config.domain_trimmed_filename)
    general_embeddings = get_trimmed_glove_vectors(
        config.general_trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_pos,
                       processing_chunk, processing_aspect_tag,
                       processing_polarity_tag, processing_joint_tag,
                       config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_pos,
                        processing_chunk, processing_aspect_tag,
                        processing_polarity_tag, processing_joint_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_pos, processing_chunk,
                         processing_aspect_tag, processing_polarity_tag,
                         processing_joint_tag, config.max_iter)

    data = [dev, test, train]
    # full pass over the datasets (eager map under Python 2), presumably to
    # populate the max_sentence_len attributes used below
    _no_use_ = map(len, chain.from_iterable(w for w in (s for s in data)))
    max_sentence_size = max(train.max_sentence_len, test.max_sentence_len,
                            dev.max_sentence_len)
Example #20
# training data
train_filename = "{}/train.txt".format(data_dir)
valid_filename = "{}/valid.txt".format(data_dir)
# glove files
glove_filename = "{}/glove.6B.{}d.txt".format(glove_dir, dim_word)
# trimmed embeddings (created from glove_filename with build_data.py)
filename_trimmed = "{}/glove.6B.{}d.trimmed.npz".format(output_dir, dim_word)

words_filename = "{}/words.txt".format(output_dir)
tags_filename = "{}/tags.txt".format(output_dir)
chars_filename = "{}/chars.txt".format(output_dir)

processing_word = get_processing_word(lowercase=True)

train = CoNLLDataset(train_filename, processing_word)
valid = CoNLLDataset(valid_filename, processing_word)

# Build word and tag vocabs
vocab_words, vocab_tags = get_vocabs([train, valid])
vocab_glove = get_glove_vocab(glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, words_filename)
write_vocab(vocab_tags, tags_filename)

# Trim GloVe Vectors
Example #21
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags,
                                     posflag=True,
                                     lowercase=True,
                                     pos_lm=True)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
syl_embeddings = get_exported_dic_vectors(config.exported_sfilename)
pos_embeddings = get_exported_pos_vectors(config.exported_pfilename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
lmposs = len(pos_tags)

model = NERModel(config,
                 embeddings,
                 dic_embeddings,
                 pos_embeddings,
                 syl_embeddings,
                 morph_embeddings,
use_chars = True
max_iter = None

print('Loading vocab files and word vectors from {}'.format(data_dir))
vocab_tags = load_vocab("{}/assets/tags.txt".format(data_dir))
vocab_chars = load_vocab("{}/assets/chars.txt".format(data_dir))
vocab_words = load_vocab("{}/assets/words.txt".format(data_dir))

n_words = len(vocab_words)
n_char = len(vocab_chars)
n_tags = len(vocab_tags)
pad_tag = n_tags
n_labels = n_tags + 1

# coNLL data for train
train = CoNLLDataset(train_filename, get_processing_word(vocab_words, vocab_chars, lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)
# CoNLL data for validation
valid = CoNLLDataset(valid_filename, get_processing_word(vocab_words, vocab_chars, lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False, allow_unk=False), max_iter)

emb_data = np.load("{}/assets/glove.6B.300d.trimmed.npz".format(data_dir))
embeddings = emb_data["embeddings"]

# Hyperparameters
dim_word = 300
dim_char = 100
hidden_size_char = 100  # lstm on chars
hidden_size_lstm = 300  # lstm on word embeddings
nepochs = args.epochs
lr = 0.0105
lr_decay = 0.0005
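The last snippet reserves an extra label purely for padding (pad_tag = n_tags, n_labels = n_tags + 1). A minimal, self-contained sketch of padding a batch of tag-id sequences with that extra id; pad_tag_sequences is a hypothetical helper for illustration, not taken from any of the repositories above:

def pad_tag_sequences(sequences, pad_tag):
    """Pad variable-length tag-id sequences to the batch maximum with pad_tag."""
    max_len = max(len(seq) for seq in sequences)
    return [list(seq) + [pad_tag] * (max_len - len(seq)) for seq in sequences]

# e.g. with n_tags = 9 (ids 0..8) and pad_tag = 9:
# pad_tag_sequences([[1, 2, 3], [4]], pad_tag=9) -> [[1, 2, 3], [4, 9, 9]]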