def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
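All of the snippets on this page revolve around get_processing_word. For reference, the closure it returns in these sequence-tagging projects typically looks like the sketch below; the exact signature, the UNK/NUM token strings and the error handling are assumptions, and several examples here extend the function with extra vocabularies (morphemes, prefixes/suffixes, POS tags, dictionaries).

UNK = "$UNK$"
NUM = "$NUM$"

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    """Returns a function word -> word id, or (char ids, word id) when chars=True."""
    def f(word):
        # 0. char ids of the word (only if a char vocab is given and chars=True)
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]

        # 1. normalize the word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # 2. map the word to its id, falling back to UNK when allowed
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise KeyError("unknown word and allow_unk is False")

        # 3. return char ids + word id, or just the word id
        if vocab_chars is not None and chars:
            return char_ids, word
        return word

    return f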
Example #2
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config,
                                embeddings,
                                ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
Example #3
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False, allow_unk=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, config.max_iter)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
Example #4
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _  = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)


    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)

    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)

    processing_pos = get_processing_word(vocab_pos,
                                         lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)


    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)

    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)
    
    # build model
    model = NERModel(config, embeddings, embeddings_uni,
                     pos_embeddings, ntags=len(vocab_tags), nchars=len(vocab_chars), vocab_words=idx2words,
                    NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
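    # NOTE: `state` and `file` are assumed to be defined outside this snippet
    # (e.g. module-level globals or command-line arguments); they are not
    # parameters of main(config) here.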
    if state == "train":
        model.train(train, dev, vocab_tags)

    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)

    else: #state == predict
        convert(file)
        t2o("data_format/test_convert.txt","data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)

        model.evaluate(test, vocab_tags)

        tagging("data_format/test_convert.txt")
Example #5
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
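A build_data function like the one above is usually paired with one of the main(config) entry points shown on this page and driven by a small script. A minimal sketch, assuming a Config class like the one instantiated in the later examples (it holds the filenames and hyper-parameters referenced here):

if __name__ == "__main__":
    # build vocab and trimmed-embedding files once, then train/evaluate
    config = Config()
    build_data(config)
    main(config)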
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=True)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    model = WImpModel(config,
                      embeddings,
                      ntags=config.nclass,
                      nchars=len(vocab_chars))

    # build WImpModel
    model.build_graph()

    # train, evaluate and interact
    model.train(train, dev)
    model.evaluate(test)
Example #7
def build_data(config):
    """
    Procedure to build data
    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
Example #8
    def rec(sentence):
        try:

            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            words_raw = character_separation(sentence)[0].split(' ')
            words_raw = [unicode(word, 'utf-8') for word in words_raw]  # Python 2 snippet

            words = list(map(processing_word, words_raw))
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = [nlu.idx_to_tag[idx] for idx in pred_ids[0]]
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return list(preds)
        except EOFError:
            print("Closing session.")


# nlu.rec('请播放电视剧三生三世十里桃花')
Example #9
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    #test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)

    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)

    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)

    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #10
def build_joint_vocab(config):

    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005, processing_word,
                                             vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003, processing_word,
                                             vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor, processing_word,
                                          vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Example #11
    def load(self):
        self.vocab_tags = load_vocab(self.filename_tags)
        self.processing_tag = get_processing_word(self.vocab_tags,
                                                  lowercase=False,
                                                  allow_unk=False)
        self.ntags = len(self.vocab_tags)
        self.early_stop_metric_sign = -1 if self.stop_direction == 'increase' else 1
Example #12
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #13
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabs
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)
    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          vocab_pref_suff,
                                          vocab_pref_suff_2,
                                          vocab_pref_suff_4,
                                          lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags,
                                         lowercase=False,
                                         Geoparser=True)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    # dev, test and train hold the raw words and tags; processing_word and
    # processing_tag map them to the corresponding word and tag indices, so
    # model.evaluate below ends up calling run_evaluate inside run_epoch.
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
Example #14
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=3,
                     ntype=4)

    model.build()

    # train, evaluate and interact
    print(vocab_tags)
    model.train(train, dev, vocab_tags)

    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
Example #15
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
    vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    vocab_chunks = [tags for tags in vocab_chunks]
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks used as mpqa has something wrong!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    vocab_aspect_tags = [tags for tags in vocab_aspect_tags]
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = [tags for tags in vocab_polarity_tags]
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = [tags for tags in vocab_joint_tags]
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
Example #16
def test_processing_words_with_words_idx_dict_and_allow_unknow():
    d = dict()
    d['娃哈哈'] = 1
    d['#####'] = 3
    d['<UNK>'] = 0
    processing_word = get_processing_word(d, True)
    word1 = processing_word("娃哈哈")
    word2 = processing_word("12345")
    word3 = processing_word("xixihehe")
    print(word1, word2, word3)
Example #17
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O":0, "B":1, "I":2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)
    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)
    processing_iob = get_processing_word(vocab_iob, 
                    lowercase=False)
    processing_type = get_processing_word(vocab_type, 
                    lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_iob, processing_type, config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars),
                                         niob=3,
                                         ntype=ntype)

    model.build()

    model.train(train, dev, vocab_type)

    model.evaluate(test, vocab_type)
Example #18
    def load(self):
        #load vocab dictionary
        self.vocab_words = load_dict(self.f_words)
        self.vocab_tags = load_dict(self.f_tags)
        self.vocab_chars = load_dict(self.f_chars)

        self.num_word = len(self.vocab_words)
        self.num_tag = len(self.vocab_tags)
        self.num_char = len(self.vocab_chars)

        #processing to map string to id
        self.processing_word = get_processing_word(self.vocab_words,
                                                   self.vocab_chars,
                                                   lowercase=True,
                                                   chars=self.use_chars)
        self.processing_tag = get_processing_word(self.vocab_tags,
                                                  lowercase=False,
                                                  allow_unk=False)

        #pretrained embedding
        self.embbedings = (processing_trimmed_glove_vector(self.f_trimmed)
                           if self.use_pretrained else None)
Example #19
def build_data(config, logger):
    """
    Procedure to build data
    """

    # Generators
    processing_word = get_processing_word(lowercase=config.lowercase)
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    logger.info("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs(
        [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    vocab_tags = [tags for tags in vocab_tags]
    vocab_tags.remove("O")
    vocab_tags.insert(0, "O")
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    logger.info("Build chars vocab...")
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save Depstree
    processing_relation = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, processing_word,
                           processing_relation)
    train_deps = DepsDataset(config.train_deps_filename, processing_word,
                             processing_relation)

    logger.info("Build relations vocab...")
    vocab_relations = get_relations_vocabs([train_deps, dev_deps])
    vocab_relations.add(UNK)
    write_vocab(vocab_relations, config.relations_filename)
Example #20
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)


    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
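        # assumes IOB-style tags ("B-LOC", "I-PER", ...): tag[2:] drops the
        # "B-"/"I-" prefix and keeps the bare entity type; "O" carries no type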
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
Example #21
def test_dataset():
    # test getDataset and get_vocabs
    processing_word = get_processing_word()
    dev = getDataset("../data/test_ner.txt", processing_word)
    vocab_words, vocab_tags = get_vocabs([dev])

    # get common vocab from dev file and polyglot
    vocab_poly = get_polyglot_vocab("../data/polyglot-zh.pkl")
    vocab = vocab_words & vocab_poly
    vocab.add(UNK)

    write_vocab(vocab, "../data/words.txt")
    write_vocab(vocab_tags, "../data/tags.txt")

    vocab = load_vocab("../data/words.txt")
    export_trimmed_polyglot_vectors(vocab, "../polyglot-zh.pkl", "../data/polyglot.trimmed.npz", 64)
    data = get_trimmed_polyglot_vectors("../data/polyglot.trimmed.npz")
Example #22
    def load(self):
        """Loads vocabulary, processing functions and embeddings
        """
        # 1. vocabulary
        self.vocab_words = load_vocab(self.filename_words)
        self.vocab_chars = load_vocab(self.filename_chars)

        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)

        # 2. get processing functions that map str -> id
        self.processing_word = get_processing_word(self.vocab_words,
                                                   self.vocab_chars,
                                                   lowercase=True,
                                                   chars=self.use_chars)

        # 3. get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                           if self.use_pretrained else None)
Example #23
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename, 
                                config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)

    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore, check and remove, if this is the case
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))

    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file, pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
Example #25
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies, it writes them to a file;
    writing a vocabulary to a file assigns an id (the line number) to each
    word. It then extracts the relevant GloVe vectors and stores them in a
    numpy array such that the i-th entry corresponds to the i-th word in the
    vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
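The docstring above describes what the vocabulary files and the trimmed GloVe array look like. A minimal sketch of helpers that behave as described (the function names match the calls used throughout this page, but these bodies are assumptions, not the original implementations):

import numpy as np

def write_vocab(vocab, filename):
    # one token per line; the line number becomes the token's id
    with open(filename, "w") as f:
        f.write("\n".join(vocab))

def load_vocab(filename):
    # inverse of write_vocab: token -> line number
    with open(filename) as f:
        return {word.strip(): idx for idx, word in enumerate(f)}

def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    # keep only the GloVe rows for in-vocabulary words; row i is the vector
    # of the word whose id is i, as stated in the docstring above
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(" ")
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)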
Example #26
# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)
vocab_morphs = load_vocab(config.morphs_filename)  #morphs add
vocab_syls = load_vocab(config.word_syl_filename)
pos_tags = load_vocab(config.posTag_filename)  #pos tag adding----
dic_words = load_vocab(config.word_dic_filename)  #dic add

# get processing functions
processing_word = get_processing_word(vocab_words,
                                      dic_words,
                                      vocab_chars,
                                      vocab_morphs,
                                      vocab_syls,
                                      pos_tags,
                                      lowercase=True,
                                      chars=config.chars,
                                      morphs=config.morphs,
                                      posflag=config.posTag,
                                      pos_lm=config.posLM,
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags,
                                     posflag=True,
                                     lowercase=True,
                                     pos_lm=True)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
Example #27
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from model import NERModel
from config import Config

# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags  = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                lowercase=True, chars=config.chars)
processing_tag  = get_processing_word(vocab_tags, 
                lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev   = CoNLLDataset(config.dev_filename, processing_word,
                    processing_tag, config.max_iter)
test  = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                    processing_tag, config.max_iter)

# build model
Example #28
def build_data(config):
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)

    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue

        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue

            wimp_trans_files = [
                os.path.join(folder, meta_filename % (file_id, 'A')),
                os.path.join(folder, meta_filename % (file_id, 'B'))
            ]

            swd_trans_files = [
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'A')),
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'B'))
            ]

            for i, wimp_trans_file in enumerate(wimp_trans_files):
                swd_trans_file = swd_trans_files[i]
                file_id, speaker = swd_trans_file.split("/")[-2:]
                speaker = speaker[6]
                with open(wimp_trans_file) as w_file_obj, open(
                        swd_trans_file) as s_file_obj:
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        sentence = []
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert len(anns) == len(wrds), \
                            "file mismatch, line %d : %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)

                        for id_, wrd in enumerate(wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append([(file_id, line_num, speaker),
                                                 wrd,
                                                 float(anns[id_])])

                        if len(sentence) != 0:
                            annotations.append(sentence)

    random.shuffle(annotations)

    #80% for training, 10% dev, 10% test
    n = len(annotations)
    d_train = annotations[:int(0.8 * n)]
    d_test = annotations[int(0.8 * n):int(0.9 * n)]
    d_dev = annotations[int(0.9 * n):]

    def prep_text_data(D, outfile):
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    # Vocabulary is built using training data
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #29
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from general_utils import get_logger
from model import NERModel
from config import config

# directory for training outputs
if not os.path.exists(config.output_path):
    os.makedirs(config.output_path)

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, lowercase=config.lowercase)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     config.max_iter)

# get logger
Example #30
import numpy as np
import os
import tensorflow as tf
from config import Config
from sklearn.model_selection import train_test_split
from dataobject import CoNLLDataset
from data_utils import get_vocabs, UNK, NUM, \
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word

# Create instance of config
config = Config()

processing_word = get_processing_word(lowercase=True)

# Generators
dev   = CoNLLDataset(config.filename_dev, processing_word)
test  = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)
vocab = vocab_words & vocab_glove
vocab.add(config.UNK)
vocab.add(config.NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)
Example #31
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev,
                                                     test])  #pos adding-----
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)  #add dic vector get
    vocab_syl = get_dic_vocab(config.syl_filename, 1)  #add syl vector
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  #morph vector get

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic  #add dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl  #add syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph  # add morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)  #add dic
    write_vocab(word_syl, config.word_syl_filename)  #add syl
    write_vocab(word_morph, config.morphs_filename)  #add morph
    write_vocab(vocab_pos, config.posTag_filename)  #add pos

    # Trim GloVe Vectors(pretrain vector)
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
    word_dic = load_vocab(config.word_dic_filename)  #dic add
    export_dic_vectors(word_dic, config.dic_filename, config.exported_filename,
                       config.dic_dim)
    word_syl = load_vocab(config.word_syl_filename)  #syl add
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)
    word_morph = load_vocab(config.morphs_filename)  #morph add
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)
    vocab_pos = load_vocab(config.posTag_filename)  #pos add
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab, morph vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #32
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from model import NERModel
from config import Config

# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words,
                                      vocab_chars,
                                      lowercase=True,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     config.max_iter)

# build model