Example #1
# Build the cache file path; the file name encodes the preprocessing options.
cache_name = os.path.join(
    'cache',
    (args.dataset + '_lattice' + '_only_train:{}' + '_trainClip:{}' +
     '_norm_num:{}' + 'char_min_freq{}' + 'bigram_min_freq{}' +
     'word_min_freq{}' + 'only_train_min_freq{}' +
     'number_norm{}' + 'lexicon_{}' + 'load_dataset_seed_{}').format(
         args.only_lexicon_in_train, args.train_clip, args.number_normalized,
         args.char_min_freq, args.bigram_min_freq, args.word_min_freq,
         args.only_train_min_freq, args.number_normalized, args.lexicon_name,
         load_dataset_seed))
datasets, vocabs, embeddings = equip_chinese_ner_with_lexicon(
    datasets,
    vocabs,
    embeddings,
    w_list,
    yangjie_rich_pretrain_word_path,
    _refresh=refresh_data,
    _cache_fp=cache_name,
    only_lexicon_in_train=args.only_lexicon_in_train,
    word_char_mix_embedding_path=yangjie_rich_pretrain_char_and_word_path,
    number_normalized=args.number_normalized,
    lattice_min_freq=args.lattice_min_freq,
    only_train_min_freq=args.only_train_min_freq)
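# After this call the datasets presumably carry the extra lattice inputs, vocabs
# gains a 'lattice' vocabulary built from the lexicon words (see the note in
# Example #2 below), and embeddings holds the matching pretrained vectors.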

print('train:{}'.format(len(datasets['train'])))
avg_seq_len = 0
avg_lex_num = 0
avg_seq_lex = 0
train_seq_lex = []
dev_seq_lex = []
test_seq_lex = []
train_seq = []
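# Not part of the original script: a minimal sketch of how the counters and lists
# above (and the same ones at the end of Example #2) could be filled. It assumes
# each instance exposes 'seq_len' and 'lex_num' fields added by
# equip_chinese_ner_with_lexicon(), which is an assumption about that function.
for split, ds in datasets.items():
    for ins in ds:
        seq_len, lex_num = ins['seq_len'], ins['lex_num']
        avg_seq_len += seq_len
        avg_lex_num += lex_num
        avg_seq_lex += seq_len + lex_num
        if split == 'train':
            train_seq.append(seq_len)
            train_seq_lex.append(seq_len + lex_num)
        elif split == 'dev':
            dev_seq_lex.append(seq_len + lex_num)
        else:
            test_seq_lex.append(seq_len + lex_num)
total = sum(len(ds) for ds in datasets.values())
avg_seq_len, avg_lex_num, avg_seq_lex = (avg_seq_len / total,
                                         avg_lex_num / total,
                                         avg_seq_lex / total)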
Example #2
# (tail of the same cache_name = os.path.join(...) construction shown in Example #1)
         args.char_min_freq, args.bigram_min_freq, args.word_min_freq,
         args.only_train_min_freq, args.number_normalized, args.lexicon_name,
         load_dataset_seed))
'''
vocabs['lattice'] is generated inside the equip_chinese_ner_with_lexicon() method;
its contents are the lexicon words.
'''
datasets, vocabs, embeddings = equip_chinese_ner_with_lexicon(
    datasets,
    vocabs,
    embeddings,  # pass in the embeddings loaded above
    w_list,  # this also needs to be adapted for the tianchi task
    word_embedding_path=None,
    # yangjie_rich_pretrain_word_path,  # the embeddings from this lexicon were used originally
    _refresh=refresh_data,
    _cache_fp=cache_name,
    only_lexicon_in_train=args.only_lexicon_in_train,
    # The original paper uses the lexicon at the path below, but for the tianchi data
    # I set it to cn-sgns-literature-word, another pretrained package that may contain
    # more of the keywords needed here.
    #word_char_mix_embedding_path=yangjie_rich_pretrain_char_and_word_path,
    #word_char_mix_embedding_path=tianchi_pretrain_words_path,
    word_char_mix_embedding_path='cn-sgns-literature-word',
    number_normalized=args.number_normalized,
    lattice_min_freq=args.lattice_min_freq,
    only_train_min_freq=args.only_train_min_freq)
print(vocabs['lattice'])
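# Not part of the original script: a quick peek at vocabs['lattice'], assuming it
# is a standard fastNLP Vocabulary (len() and to_word() are part of its usual API).
lattice_vocab = vocabs['lattice']
print('lattice vocab size:', len(lattice_vocab))
for idx in range(min(10, len(lattice_vocab))):
    # the first entries are typically padding/unknown tokens, then chars and lexicon words
    print(idx, lattice_vocab.to_word(idx))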

print('train:{}'.format(len(datasets['train'])))
avg_seq_len = 0
avg_lex_num = 0
avg_seq_lex = 0