Example #1
args.CRF = True
args.hidden_dim = 512
args.isload2train = False
# build char embeddings
if not args.use_pre_emb:
    # vocabulary build
    if not os.path.exists(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
        # Raw dataset: a txt file with one "char\ttag" pair per line:
        #   char1\ttag1
        #   char2\ttag2
        #   ...
        #   charN\ttagN
        # Note: sentences are separated by a blank line,
        # i.e. two consecutive newline characters.
        vocab_build(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
            os.path.join('data_path', args.dataset_name, train_file))

    # get word dictionary
    word2id = read_dictionary(
        os.path.join('data_path', args.dataset_name, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    with open('data_path/DaGuang/dr_d_td_all.pkl', 'rb') as f:
        id2word = pickle.load(f)
        word2id = pickle.load(f)
        print('word2id length:', len(word2id))
        _ = pickle.load(f)
    embeddings_path = os.path.join('data_path', args.dataset_name,
                                   'pretrain_embedding.npy')
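
For reference, a minimal sketch of what a vocab_build with the (vocab_path, corpus_path, min_count) signature used throughout these examples might look like, assuming the "char\ttag" corpus format described above; the <PAD>/<UNK> handling is an assumption, not taken from the source project:

import pickle

def vocab_build_sketch(vocab_path, corpus_path, min_count=1):
    # Count character frequencies over the char\ttag corpus.
    counts = {}
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:  # a blank line separates sentences
                continue
            char = line.split('\t')[0]
            counts[char] = counts.get(char, 0) + 1
    # Keep characters at or above min_count; reserve id 0 for padding.
    word2id = {'<PAD>': 0}
    for char, freq in counts.items():
        if freq >= min_count:
            word2id[char] = len(word2id)
    word2id['<UNK>'] = len(word2id)  # id for out-of-vocabulary characters
    with open(vocab_path, 'wb') as f:
        pickle.dump(word2id, f)
    return word2id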
Example #2
parser.add_argument('--shuffle',
                    type=str2bool,
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode',
                    type=str,
                    default='train',
                    help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()
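
str2bool, used as the type= converter above, is not part of argparse; a common definition (an assumption, since the excerpt does not show the project's own helper) is:

import argparse

def str2bool(v):
    # argparse hands us the raw command-line string; map it to a bool.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected')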

# Create the word2id .pkl vocabulary file
vocab_build(Path + '\\word2id.pkl', Path + '\\vocab.txt', 3)

# get char embeddings
word2id = read_dictionary(Path + '\\word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = 'D:\\resource\\general_hypernym_extraction\\data\\train.txt'
    test_path = 'D:\\resource\\general_hypernym_extraction\\data\\valid.txt'
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
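
A plausible read_corpus, assuming the blank-line-separated "char\ttag" format documented in Example #1; it returns one (characters, tags) pair per sentence:

def read_corpus_sketch(corpus_path):
    data, chars, tags = [], [], []
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if line:  # one "char\ttag" token line
                char, tag = line.split('\t')
                chars.append(char)
                tags.append(tag)
            elif chars:  # a blank line closes the current sentence
                data.append((chars, tags))
                chars, tags = [], []
    if chars:  # handle a corpus without a trailing blank line
        data.append((chars, tags))
    return data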
Example #3
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.6, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()


# build the vocabulary: vocab_build(vocab_path, corpus_path, min_count),
# writing the word2id.pkl that read_dictionary loads below
vocab_build(os.path.join('.', args.train_data, 'word2id.pkl'),
            os.path.join('.', args.train_data, 'train_data'), 1)


# encode characters, i.e. build the char vectors
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
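
random_embedding presumably returns a uniformly initialized matrix with one row per vocabulary id; a minimal sketch (the +/-0.25 range and float32 dtype are assumptions consistent with the np.load branch above):

import numpy as np

def random_embedding_sketch(word2id, embedding_dim):
    # One row per vocabulary id, small uniform values, float32 for the model.
    emb = np.random.uniform(-0.25, 0.25, (len(word2id), embedding_dim))
    return np.float32(emb)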
Example #4
                            tuple_list.append(tagg_else)
                            h += 1
                except Exception as e:
                    print(temp)
                    continue
                #tuple_list.append(temp)
            #print(tuple_list)
            for tagg_item in tuple_list:
                #print(tagg_item)
                w.write('\t'.join(tagg_item)+'\n')
            w.write('\n')
    f.close()
    print("处理结束,一共" + str(i) + "行")

#================================================================================

"""
sens_cut("./corpus/car_ner_corpus.xlsx","./data_path/original/combine_data.txt","合并")
sens_cut("./corpus/car_ner_corpus.xlsx","./data_path/original/train_data.txt","train")
sens_cut("./corpus/car_ner_corpus.xlsx","./data_path/original/test_data.txt","test")
words_label('./data_path/original/combine_data.txt','./data_path/original/combine_tagged.txt')

words_label('./data_path/original/train_data.txt','./data_path/original/train_tagged.txt')
words_label('./data_path/original/test_data.txt','./data_path/original/test_tagged.txt')
"""
data_format('./data_path/original/combine_tagged.txt','./data_path/combine_format')
data_format('./data_path/original/train_tagged.txt','./data_path/train_data')
data_format('./data_path/original/test_tagged.txt','./data_path/test_data')

data.vocab_build('./data_path/word2id.pkl','./data_path/combine_format',5)
Example #5
# generate the pkl file
import data
# import re
file = 'highmath_data'

data.vocab_build('./' + file + '/word2id.pkl', './' + file + '/ner_train_data',
                 1)
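
read_dictionary only has to undo what vocab_build pickled; a one-function sketch under that assumption:

import pickle

def read_dictionary_sketch(vocab_path):
    # Load the word2id mapping written by vocab_build.
    with open(vocab_path, 'rb') as f:
        return pickle.load(f)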
Example #6
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo/all/all_2')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()


## get char embeddings
if not os.path.exists(os.path.join('.', args.train_data, 'word2id.pkl')):
    vocab_build(os.path.join('.', args.train_data, 'word2id.pkl'), os.path.join('.', args.train_data, 'train_data'), 5)
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))

if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim, os.path.join('.', args.train_data, 'all_test'))
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
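
The np.load branches in Examples #2, #3, and this one expect pretrain_embedding.npy to exist; such a file can be cached once with np.save. A hypothetical stand-in (a real script would fill the rows from pretrained vectors aligned to word2id):

import numpy as np

word2id = {'<PAD>': 0, '<UNK>': 1}  # stand-in vocabulary for illustration
emb = np.zeros((len(word2id), 300), dtype='float32')  # rows would come from real vectors
np.save('pretrain_embedding.npy', emb)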
Example #7

## tags, BIO
tag2label = {"O": 0,
             "B-PER": 1, "I-PER": 2,
             "B-LOC": 3, "I-LOC": 4,
             "B-ORG": 5, "I-ORG": 6,
             "B-TIM": 7, "I-TIM": 8,
            }
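
With this map, converting a sentence's tag sequence to the integer labels the model trains on is a direct lookup, e.g.:

tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
labels = [tag2label[t] for t in tags]  # -> [1, 2, 0, 3]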


## get char embeddings
if args.pretrain_embedding == 'random': # use randomly initialized vectors
    vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
    corpus_path = os.path.join('.', args.train_data, 'train_data')
    word2id = vocab_build(vocab_path, corpus_path, min_count=1) # assign an id to each character
    embeddings = random_embedding(word2id, args.embedding_dim) # randomly initialized vectors
else:
    vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
    embedding_path = os.path.join('.', 'embedding', args.pretrain_embedding) 
    word2id, embeddings = read_pretrain_embedding(vocab_path, embedding_path) # assign ids and load pretrained vectors


## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

Example #8
parser.add_argument('--embedding_dim',
                    type=int,
                    default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle',
                    type=str2bool,
                    default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

# vocabulary build
if not os.path.exists(
        os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
    vocab_build(os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
                os.path.join('data_path', args.dataset_name, 'train_data.txt'))

# get word dictionary
word2id = read_dictionary(
    os.path.join('data_path', args.dataset_name, 'word2id.pkl'))

# build char embeddings
if not args.use_pre_emb:
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    pre_emb_path = os.path.join('.', args.pretrained_emb_path)
    embeddings_path = os.path.join('data_path', args.dataset_name,
                                   'pretrain_embedding.npy')
    if not os.path.exists(embeddings_path):
        build_character_embeddings(pre_emb_path, embeddings_path, word2id,