Code Example #1
def main():
    print("start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # create instance of config
    config = Config()
    config.dim_char = arg.dim_char
    config.hidden_size_char = arg.hidden_size_char
    config.hidden_size_lstm_1 = arg.hidden_size_lstm_1
    config.hidden_size_lstm_2 = arg.hidden_size_lstm_2
    config.batch_sample = arg.batch_sample
    config.elmo_scale = arg.elmo_scale
    config.lr_method = arg.lr_method
    config.batch_size = arg.batch_size
    config.learning_rate = arg.learning_rate
    config.decay_logic = arg.decay_logic
    config.run_name = arg.run_name

    # build model
    model = NERModel(config)
    model.build()

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.elmofile_dev,
                       config.processing_word, config.processing_postags,
                       config.generate_anchor, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.elmofile_train,
                         config.processing_word, config.processing_postags,
                         config.generate_anchor, config.max_iter)

    # train model
    model.train(train, dev)
    print("end time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
Code Example #2
def main():
    config = Config()

    #-------------------------------------------------------------------
    # build model
    # ------------------------------------------------------------------
    model = NERModel(config)
    model.build()

    # ------------------------------------------------------------------
    # train mode
    # ------------------------------------------------------------------
    if config.mode == 'train':
        print('\n ... training model ... \n')
        test = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter)
        if config.periodic:
            split = CoNLLDataset(config.dummy_train, config.processing_word,
                         config.processing_tag, config.max_iter)
        else:
            split = CoNLLDataset(config.train_split[config.split], config.processing_word,
                         config.processing_tag, config.max_iter)
        model.train(split, test)

    # ------------------------------------------------------------------
    # retrain mode
    # ------------------------------------------------------------------
    if config.mode == 'retrain':
        print('\n ... retraining model ... \n')
        model.restore_session(config.dir_model)
        retrain = CoNLLDataset(config.filename_retrain, config.processing_word,
                           config.processing_tag, config.max_iter)
        test = CoNLLDataset(config.filename_test, config.processing_word,
                       config.processing_tag, config.max_iter)
        model.train(retrain, test)
Code Example #3
def main():
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = [i for i in vocab_words if i in vocab_glove]
    vocab.append(UNK)
    vocab.append(NUM)
    vocab.append('</pad>')
    vocab_tags.append('</pad>')

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.append('</pad>')
    write_vocab(vocab_chars, config.filename_chars)
Code Example #4
def evaluate():
    augment_pred = []
    with NERModel(config) as model:

        # create datasets
        augment = CoNLLDataset(config.filename_augment, config.processing_word,
                               config.processing_tag, config.max_iter)

        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)

        # build model (reusing the instance created by the with-statement above)
        model.build()
        model.restore_session(config.dir_model)

        # evaluate
        model.logger.info("\nEvaluation on Test")
        model.evaluate(test)

        model.logger.info("\nEvaluation on Augment")
        model.evaluate(augment, augment_pred)

        # model.logger.debug(augment_pred)

    # clear memory
    del model

    return augment_pred
Code Example #5
def main():
    # create instance of config
    config = Config()

    config.dim_char = arg.dim_char
    config.hidden_size_char = arg.hidden_size_char
    config.hidden_size_lstm_1 = arg.hidden_size_lstm_1
    config.hidden_size_lstm_2 = arg.hidden_size_lstm_2
    config.cls_hidden_size = arg.cls_hidden_size
    config.batch_sample = arg.batch_sample
    config.elmo_scale = arg.elmo_scale
    config.lr_method = arg.lr_method
    config.batch_size = arg.batch_size
    config.learning_rate = arg.learning_rate
    config.decay_logic = arg.decay_logic
    config.run_name = arg.run_name
    config.input_feature_dim = 600  #config.hidden_size_lstm * 2 #+ 1024
    config.dir_saved_roi = arg.dir_saved_roi

    # build model
    model = NERModel(config)
    model.build()

    # create datasets
    config.filename_dev = config.dir_saved_roi + "dev_word_ids/"
    #config.filename_test = config.dir_saved_roi + "test_word_ids/"
    config.filename_train = config.dir_saved_roi + "train_word_ids/"

    dev = CoNLLDataset(config.filename_dev)
    print("Loading dev set done!")
    train = CoNLLDataset(config.filename_train)
    print("Loading train set done!")

    # train model
    model.train(train, dev, config.dev_total_entity)
Code Example #6
def main():

    config = Config()

    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    processing_word = get_processing_word(lowercase=False)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    entities = []
    for raw_words, raw_tags in test:
        chunks = get_chunks_from_tags(raw_tags)
        for _, chunk_start, chunk_end in chunks:
            entity = 'ENTITY/'
            for i in range(chunk_start, chunk_end):
                if i == chunk_end - 1:
                    entity += raw_words[i]
                else:
                    entity = entity + raw_words[i] + '_'
            entities.append(entity)
    # print(len(entities))
    # print(entities)

    entities = set(entities)
    print(len(entities))
    vocab_glove = get_glove_vocab(config.filename_glove)
    print(len(entities & vocab_glove))
Code Example #7
def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)

    # create dataset

    # processing_word = get_processing_word(lowercase=True)

    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            test = CoNLLDataset(config.filename_test)

        elif sys.argv[1] == 'dev':
            test = CoNLLDataset(config.filename_dev)

    else:
        assert len(sys.argv) == 1
        test = CoNLLDataset(config.filename_test)

    test4cl = CoNLLdata4classifier(test,
                                   processing_word=config.processing_word,
                                   processing_tag=config.processing_tag)

    # evaluate and interact
    model.evaluate(test4cl)
Code Example #8
def main():
    # create instance of config
    config = Config()
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)
    predict = CoNLLDataset("data/source_data.txt", config.processing_word,
                           config.max_iter)
    max_sequence_length = max(max([len(seq[0]) for seq in train]),
                              max([len(seq[0]) for seq in dev]),
                              max([len(seq[0]) for seq in test]),
                              max([len(seq[0]) for seq in predict]))

    max_word_length = max(
        max([len(word[0]) for seq in train for word in seq[0]]),
        max([len(word[0]) for seq in test for word in seq[0]]),
        max([len(word[0]) for seq in dev for word in seq[0]]))
    print(max_word_length, max_sequence_length)
    model = NERModel(config, max_word_length, max_sequence_length)
    model.build()
    model.restore_session(config.dir_model)
    model.run_predict(predict)
Code Example #9
def main():
    # create instance of config; here the config also takes care of loading data:
    # it holds the vocabulary, the GloVe-pretrained embeddings matrix, and the str->id functions
    config = Config()

    # build model
    model = NERModel(config)
    model.build("train")

    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    # processing_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev)
    train = CoNLLDataset(config.filename_train)
    test = CoNLLDataset(config.filename_test)

    train4cl = CoNLLdata4classifier(train,
                                    processing_word=config.processing_word,
                                    processing_tag=config.processing_tag,
                                    context_length=config.context_length)
    dev4cl = CoNLLdata4classifier(dev,
                                  processing_word=config.processing_word,
                                  processing_tag=config.processing_tag,
                                  context_length=config.context_length)
    test4cl = CoNLLdata4classifier(test,
                                   processing_word=config.processing_word,
                                   processing_tag=config.processing_tag,
                                   context_length=config.context_length)

    # train model
    model.train(train4cl, dev4cl, test4cl)
Code Example #10
def main():
    # create instance of config; here the config also takes care of loading data:
    # it holds the vocabulary, the GloVe-pretrained embeddings matrix, and the str->id functions
    config = Config()
    config.nepochs          = 200
    config.dropout          = 0.5
    config.batch_size       = 19
    config.lr_method        = "adam"
    config.lr               = 0.0001
    config.lr_decay         = 1.0
    config.clip             = -2.0 # if negative, no clipping
    config.nepoch_no_imprv  = 5

    config.dir_model = config.dir_output + "model.finetuning.weights/"
    
    # build model
    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session("results/test/model.weights/", indicate="fine_tuning")

    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    dev   = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter)

    # train model

    model.train(train, dev, test)
Code Example #11
File: train.py  Project: zxz53000/S-LSTM
def main():
    # create instance of config
    config = Config()
    config.layer = int(sys.argv[1])
    config.step = int(sys.argv[2])

    if config.task == 'pos':
        print("USING POS")
        config.filename_train = "data/train.pos"  # test
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"
    else:
        print("USING NER")
    print("iteration: " + str(config.layer))
    print("step: " + str(config.step))

    # build model
    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets
    dev   = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)
    # train model
    model.train(train, dev, test)
Code Example #12
def main():
    # create instance of config
    config = Config()

    #build model
    model = BLSTMCRF(config)  #Word_BLSTM(config)
    #model = Word_BLSTM(config)
    model.build()
    model.compile(optimizer=model.get_optimizer(),
                  loss=model.get_loss())  #, metrics=['acc']

    #model.summary()
    # Loading weights
    #model.load_weights('./saves/test20.h5')

    # create datasets
    dev = CoNLLDataset(
        config.filename_train,
        config.processing_word,  #filename_dev
        config.processing_tag,
        config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    model.summary()
    # train model
    model.train(train, dev)

    # Save model
    model.save_weights('./saves/test20.h5')
Code Example #13
def main():
    # create instance of config
    config = Config()
    config.dir_model = config.dir_output + "model.finetuning.weights/"

    # build model
    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session(config.dir_model)

    # create dataset
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            test = CoNLLDataset(config.filename_test,
                                config.processing_word,
                                config.processing_tag,
                                max_length=None)
        elif sys.argv[1] == 'dev':
            test = CoNLLDataset(config.filename_dev,
                                config.processing_word,
                                config.processing_tag,
                                max_length=None)
    else:
        assert len(sys.argv) == 1
        test = CoNLLDataset(config.filename_test,
                            config.processing_word,
                            config.processing_tag,
                            max_length=None)

    # evaluate and interact
    model.evaluate(test)
Code Example #14
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word, task=config.task)
    test = CoNLLDataset(config.filename_test,
                        processing_word,
                        task=config.task)
    train = CoNLLDataset(config.filename_train,
                         processing_word,
                         task=config.task)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    #TODO get word2vec vocab too

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save word and tag vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # write and trim GloVe and word2vec Vectors
    vocab = load_vocab(config.filename_words)
    write_word2vec_to_txtfile(config.path_to_word2vec_bin_file,
                              config.filename_word2vec)
    export_trimmed_word2vec_vectors(vocab, config.filename_word2vec,
                                    config.trimmed_word2vec_filename,
                                    config.dim_word)

    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.trimmed_glove_filename,
                                 config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
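
The docstring above describes the contract of the vocabulary helpers: the line number of a word in the vocab file serves as its id, and the trimmed embedding matrix is indexed by that id. A minimal sketch of helpers with that behaviour follows; the actual implementations in these projects may differ in details such as file encoding and the .npz key name, so treat this as an illustration rather than the projects' code.

import numpy as np

def write_vocab(vocab, filename):
    # one token per line; the line index becomes the token id
    with open(filename, "w") as f:
        f.write("\n".join(vocab))

def load_vocab(filename):
    # read the file back into a dict {token: id}
    with open(filename) as f:
        return {word.strip(): idx for idx, word in enumerate(f)}

def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    # keep only the GloVe rows for words in vocab, stored so that
    # row i of the matrix is the embedding of the word whose id is i
    embeddings = np.zeros((len(vocab), dim), dtype=np.float32)
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(" ")
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)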
Code Example #15
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)  # lowercase all characters and replace digits with NUM

    # Generators
    dev = CoNLLDataset(config.filename_dev,
                       processing_word)  # creates a generator; each iteration yields a tuple (words, tags)
    test = CoNLLDataset(config.filename_test,
                        processing_word)  # returns one sentence (words) and its tags
    train = CoNLLDataset(config.filename_train, processing_word)

    # further processing of the data

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])  # word vocabulary and tag vocabulary
    print(len(vocab_words))

    vocab_glove = get_glove_vocab(config.filename_glove)  # GloVe vocabulary

    vocab = vocab_words & vocab_glove  # & is set intersection; both operands are sets
    vocab.add(UNK)
    vocab.add(NUM)  # added manually
    print("len of vocab without entity: ", len(vocab))

    # vocab_entity = entity2vocab(datasets=[train, dev, test])
    # vocab.update(vocab_entity)
    # vocab = entity2vocab(datasets=[train, dev], vocab=vocab)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)  # gives a dict vocab: {word: index}
    # for this vocab, generate a numpy embedding file containing the corresponding embedding matrix
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab: builds the character vocabulary; lowercasing is not applied here, only the raw file is used
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
Code Example #16
File: train.py  Project: ml-lab/MLMA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_lang', type=str, default='en')
    parser.add_argument('--dev_lang', type=str, default='en')
    parser.add_argument('--test_lang', type=str, default='en')
    parser.add_argument('--is_pos', type=int, default=0, help='NER or POS?')

    parser.add_argument('--dataset', type=str, default='conll2003', help='Dataset directory')
    parser.add_argument('--dir', type=str, default=None, help='Output directory')
    parser.add_argument('--use_chars', type=int, default=1, help='Use character LSTM or not')
    parser.add_argument('--epoch', type=int, default=30)

    parser.add_argument('--emb_type', type=str, default='word', help='word | trans | word_trans')
    parser.add_argument('--emb_dim', type=int, default=300, help='Dimension of word embeddings')
    parser.add_argument('--model_dir', type=str, default='data/output_model_config_fb_wikitext103/', help='Transformer directory model')
    parser.add_argument('--layer', type=int, default=None, help='Select a single layer from Transformer')
    parser.add_argument('--trans_concat', type=str, default='all', help='all | sws | fws')
    parser.add_argument('--trans_dim', type=int, default=512, help='Transformer hidden size')
    parser.add_argument('--trans_layer', type=int, default=7, help='The total number of Transformer layers')

    parser.add_argument('--trans_type', type=str, default='monolingual', help="monolingual | crosslingual")
    parser.add_argument('--trans_vocab_src', type=str, default=None, help='Source language Transformer vocabulary')
    parser.add_argument('--trans_vocab_tgt', type=str, default=None, help='Target language Transformer vocabulary')

    args = parser.parse_args()

    # with tf.device('/cpu:0'):

    # create instance of config
    # print(args.use_attn, type(args.use_attn))

    langs = [args.train_lang, args.dev_lang, args.test_lang]
    #config = Config(mix_vocab=args.mix_vocab, use_crf=args.use_crf, mono_trans=args.mono_trans, is_pos=args.is_pos, emb_dim=args.emb_dim, src_lang=args.train_lang, tgt_lang=args.test_lang, no_glove=args.no_glove, select_layer=args.select_layer, weighted_sum_full=args.weighted_sum_full, naive_proj=args.naive_proj, highway=args.highway, weighted_sum=args.trans_weighted_sum, trans_dim=args.trans_dim, dataset=args.dataset, trans_vocab=args.trans_vocab, use_transformer=args.use_trans, dir_=args.dir, use_chars=args.use_chars, use_attn=args.use_attn, char_init=args.char_init, model_dir=args.model_dir, trans_to_output=args.trans_to_output, epoch=args.epoch)

    config = Config(args)

    # create datasets
    dev   = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter, lang=args.dev_lang)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter, lang=args.train_lang)
    test  = CoNLLDataset(config.filename_test, config.processing_word,
                         config.processing_tag, config.max_iter, lang=args.test_lang)

    #n_vocab = len(config.vocab_trans)
    #n_ctx = max([dev.max_seq, train.max_seq, test.max_seq])

    # with tf.device('/cpu:0'):
    # build model

    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # train model
    model.train(train, dev)
Code Example #17
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test.py) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=False)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    # test.py  = CoNLLDataset(config.filename_test, processing_word)  -- the test set will need to be added in later as well
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev])
    # skip get_glove_vocab for now
    # vocab_glove = get_glove_vocab(config.filename_glove)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_words

    vocab.add(UNK)
    vocab.add(PAD)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors
    # vocab = load_vocab(config.filename_words)
    # export_trimmed_glove_vectors(vocab, config.filename_glove,config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars_train = get_char_vocab(train)

    dev = CoNLLDataset(config.filename_dev)
    vocab_chars_dev = get_char_vocab(dev)
    vocab_chars_train_dev = list(vocab_chars_dev & vocab_chars_train)
    vocab_chars = [UNK, PAD, NUM]
    vocab_chars.extend(vocab_chars_train_dev)

    write_vocab(vocab_chars, config.filename_chars)
Code Example #18
def main(i, al, filenameextra):
    #Call in an iterator
    # create instance of config
    #config = Config()
    print("********Active training round ", i)
    # Initialize creating dataset
    # create datasets
    train_round = None
    select = None
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag,
                       config.max_iter)  #always keep the same dev and test

    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    if (i == 1):
        train = CoNLLDataset(config.filename_train, config.processing_word,
                             config.processing_tag, config.max_iter)

        train = list(train)
        train_round = train[0:config.num_query]
        select = train[config.num_query:len(train)]
    else:
        fn = open(config.filename_pkl + str(i), 'rb')
        train_round, select = pickle.load(fn)
        fn.close()

    print("Training size ", len(train_round))
    print("Number of left training samples ", len(select))
    modename = str(i) + "_" + al + "_" + filenameextra
    out = train_active(train_round, dev, test, select, config, modename)
    #sort select list based on scores

    if config.active_strategy == "cluster":
        print('Scores from cluster ', out)
    else:
        if al == 'mu' or al == "mg":
            select = [x for _, x in sorted(zip(out, select))]  # sort based on output of selection
        elif al == 'lu':
            select = [x for _, x in sorted(zip(out, select), reverse=True)]
        elif al == 'rand':
            shuffle(select)

    num_samples = min(config.num_query, len(select))
    train_round += select[0:num_samples]
    select = select[num_samples:len(select)]
    shuffle(train_round)
    shuffle(select)
    i = i + 1
    fo = open(config.filename_pkl + str(i), 'wb')
    pickle.dump((train_round, select), fo)
    fo.close()
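
Example #18 above is written to be driven from an outer loop: each call trains on the currently selected pool, ranks the remaining samples with the chosen active-learning strategy (mu, mg, lu, or rand), and pickles the updated (train_round, select) split for the next round. A hedged sketch of such a driver, where the round count, strategy name, and filename suffix are purely illustrative assumptions:

if __name__ == "__main__":
    # hypothetical driver: ten active-learning rounds with the
    # least-uncertainty ("lu") strategy; values are assumptions, not project defaults
    for round_idx in range(1, 11):
        main(round_idx, "lu", "experiment1")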
Code Example #19
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    if config.task == 'pos':
        print("USING POS")
        config.filename_train = "data/train.pos"  # test
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"
    else:
        print("USING NER")
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
Code Example #20
File: utils.py  Project: qiangofzju/basic_nlp
def train(config):
    # build model
    model = NERModel(config)
    model.build()

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev)
Code Example #21
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # intersect with the GloVe vocabulary; keep only the words that have vectors
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab, vocab: set()
    print("write vocab set to file: " + config.filename_words)
    write_vocab(vocab, config.filename_words)
    print("write vocab tags set to file: " + config.filename_tags)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors: load only the vectors for words that appear in the vocabulary
    vocab_to_index_dict = load_vocab(config.filename_words)
    # vocab: dict, vocab[word] = word_index
    print("export trimmed vocab embedding to file: " + config.filename_trimmed)
    export_trimmed_glove_vectors(vocab_to_index_dict, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    print("save char set to file:" + config.filename_chars)
    write_vocab(vocab_chars, config.filename_chars)
Code Example #22
def main():
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)
    processing_pos = get_processing_word()
    processing_chunk = get_processing_word()
    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word, processing_pos,
                       processing_chunk)
    test = CoNLLDataset(config.filename_test, processing_word, processing_pos,
                        processing_chunk)
    train = CoNLLDataset(config.filename_train, processing_word,
                         processing_pos, processing_chunk)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_poses, vocab_chunks = get_vocabs(
        [train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = [i for i in vocab_words if i in vocab_glove]
    vocab.append(UNK)
    vocab.append(NUM)
    vocab.append("$pad$")
    vocab_poses.append("$pad$")
    vocab_chunks.append("$pad$")
    vocab_tags.append("$pad$")

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    write_vocab(vocab_poses, config.filename_poses)
    write_vocab(vocab_chunks, config.filename_chunks)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    print(len(vocab))
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    vocab = load_vocab(config.filename_poses)
    export_trimed_ont_hot_vectors(vocab, config.filename_pos_trimmed)

    vocab = load_vocab(config.filename_chunks)
    export_trimed_ont_hot_vectors(vocab, config.filename_chunk_trimmed)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.append("$pad$")
    write_vocab(vocab_chars, config.filename_chars)
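
Example #22 also exports one-hot feature matrices for the POS and chunk vocabularies via export_trimed_ont_hot_vectors (the misspelling is the project's own identifier). That helper is project-specific; a plausible minimal sketch, assuming it follows the same compressed-.npz convention as the trimmed GloVe export, would be:

import numpy as np

def export_trimed_ont_hot_vectors(vocab, trimmed_filename):
    # identity matrix: row i is the one-hot vector of the token whose id is i
    one_hot = np.eye(len(vocab), dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=one_hot)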
Code Example #23
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab (only from train!)
    vocab_words, vocab_freqs, vocab_tags = get_vocabs([train])  #, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    #vocab = make_unks(vocab, vocab_freqs, config.p_unk)
    #vocab.add(UNK)
    vocab.add(NUM)
    vocab = [UNK] + list(vocab)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    # get singletons
    singletons = [k for k, v in vocab_freqs.items() if v == 1]
    write_vocab(singletons, config.filename_singletons)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
Code Example #24
File: utils.py  Project: qiangofzju/basic_nlp
def build(config):
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev) and extracts the vocabularies in terms of words, tags, and
    characters. Having built the vocabularies it writes them to a file. The
    writing of the vocabulary to a file assigns an id (the line #) to each word.
    It then extracts the relevant GloVe vectors and stores them in an np array
    such that the i-th entry corresponds to the i-th word in the vocabulary.


    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    # config = Config(load=False, args=args)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    train = CoNLLDataset(config.filename_train, processing_word)

    vocab, _ = get_vocabs([train], config.min_count)
    vocab.insert(0, UNK)

    special_flag = [NUM, NUU, FLT, FLU]
    for index, flag in enumerate(special_flag, 1):
        if flag in vocab:
            vocab.remove(flag)
        vocab.insert(index, flag)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    # test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    _, vocab_tags = get_vocabs([train, dev])

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.insert(0, UNK)

    write_vocab(vocab_chars, config.filename_chars)
Code Example #25
def main():
    # create instance of config
    config = Config()

    pretrain_path = "/home/yinghong/project/tmp/s_t/ray_results/final/exp-final-epoch30" \
                    "/train_func_0_2018-06-16_01-24-13vmtghosb"

    config_path = os.path.join(pretrain_path, "params.json")
    import json
    with open(config_path) as fin:
        hyperparams = json.load(fin)
    for key, val in hyperparams.items():
        setattr(config, key, val)

    # build model
    model = NERModel(config)
    model.build()

    model.restore_session(
        os.path.join(
            pretrain_path, "results/tmptmptest/bz=10-training-"
            "bieo-nocnn/model.weights/"))

    # create dataset
    # test  = CoNLLDataset(config.filename_test, config.processing_word,
    #                      config.processing_tag, config.max_iter)
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)

    # evaluate and interact
    model.tmp(dev, outfile="result-dev.txt")
    interactive_shell(model)
Code Example #26
def main():
    # create instance of config
    config = Config()

    config.dim_char = arg.dim_char
    config.hidden_size_char = arg.hidden_size_char
    config.hidden_size_lstm_1 = arg.hidden_size_lstm_1
    config.hidden_size_lstm_2 = arg.hidden_size_lstm_2
    config.batch_sample = arg.batch_sample
    config.elmo_scale = arg.elmo_scale
    config.lr_method = arg.lr_method
    config.batch_size = arg.batch_size
    config.learning_rate = arg.learning_rate
    config.decay_logic = arg.decay_logic
    config.run_name = arg.run_name

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model + config.run_name + '/')

    # create dataset
    test = CoNLLDataset(config.filename_test, config.elmofile_test,
                        config.processing_word, config.processing_postags,
                        config.generate_anchor, config.max_iter)
    model.evaluate(test)
Code Example #27
def main():
    # create instance of config
    config = Config()
    if config.use_elmo: config.processing_word = None

    #build model
    model = NERModel(config)

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                         config.processing_tag, config.max_iter, config.use_crf)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter, config.use_crf)

    learn = NERLearner(config, model)
    learn.fit(train, dev)
Code Example #28
File: evaluate_arg.py  Project: ttklm20/CINEX
def main():
    # create instance of config
    dir_output = "./results/" + sys.argv[2] + "/"
    config = Config(dir_output, load=False)

    config.filename_words = "./data/words_" + sys.argv[2] + ".txt"
    config.filename_chars = "./data/chars_" + sys.argv[2] + ".txt"
    config.filename_tags = "./data/tags_" + sys.argv[2] + ".txt"

    #config.dir_output = "./results/" + sys.argv[2] + "/"
    config.dir_model = config.dir_output + "model.weights/"
    config.path_log = config.dir_output + "log.txt"

    #config.filename_dev = sys.argv[1]
    config.filename_test = sys.argv[1]
    #config.filename_train = sys.argv[3]
    config.filename_pred = sys.argv[1].replace(".txt", ".pred")

    config.load()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    #test  = CoNLLDataset(config.filename_test, config.processing_word,
    #                     config.processing_tag, config.max_iter)

    test = CoNLLDataset(sys.argv[1], config.processing_word,
                        config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)
Code Example #29
def main():
    # Predictions from the first-stage model #
    config_first = Config(dir_output='./results/train_first/')
    model = NERModel(config_first)
    model.build()
    model.restore_session(config_first.dir_model)
    test = CoNLLDataset(config_first.filename_test,
                        config_first.processing_word,
                        config_first.processing_tag, config_first.max_iter)

    print()
    print('Predicting first stage!')
    model.evaluate(test)
    print()

    test_predictions = model.predict_test(test)
    formatted_predictions = format_predictions(test_predictions, 'test',
                                               config_first)

    # Predictions from the second-stage model #
    tf.reset_default_graph()
    config_second = Config(dir_output='./results/train_second/')
    model = NERModel2(config_second)
    model.build()
    model.restore_session(config_second.dir_model)

    print()
    print('Predicting second stage!')
    model.evaluate(formatted_predictions)
    print()
Code Example #30
def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/") # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets
    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    # train model
    model.train(train, dev)