args.CRF = True
args.hidden_dim = 512
args.isload2train = False

# build char embeddings
if not args.use_pre_emb:
    # vocabulary build
    if not os.path.exists(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
        # Raw dataset: a txt file with one "char\ttag" pair per line, e.g.
        #   char1\ttag1\n
        #   char2\ttag2\n
        #   ...
        #   charn\ttagn\n
        # Note that sentences are separated by a blank line (two newlines, "\n\n");
        # the next sentence then follows the same layout.
        vocab_build(
            os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
            os.path.join('data_path', args.dataset_name, train_file))
    # get word dictionary
    word2id = read_dictionary(
        os.path.join('data_path', args.dataset_name, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    with open('data_path//DaGuang//dr_d_td_all.pkl', 'rb') as f:
        id2word = pickle.load(f)
        word2id = pickle.load(f)
        print('length of word2id:', len(word2id))
        _ = pickle.load(f)
    embeddings_path = os.path.join('data_path', args.dataset_name,
                                   'pretrain_embedding.npy')
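# --- Illustrative sketch (not part of the original script) ---
# The comment above describes the raw corpus as one "char\ttag" pair per line,
# with sentences separated by a blank line. A minimal, hypothetical parser for
# that layout (the project's actual read_corpus may differ) could look like:
def read_corpus_sketch(corpus_path):
    data, sent_, tag_ = [], [], []
    with open(corpus_path, encoding='utf-8') as fr:
        for line in fr:
            line = line.rstrip('\n')
            if line:                      # one "char\ttag" pair
                char, tag = line.split('\t')
                sent_.append(char)
                tag_.append(tag)
            elif sent_:                   # blank line closes the current sentence
                data.append((sent_, tag_))
                sent_, tag_ = [], []
    if sent_:                             # file may lack a trailing blank line
        data.append((sent_, tag_))
    return data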
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

# Creating .pkl file
vocab_build(Path + '\\word2id.pkl', Path + '\\vocab.txt', 3)

# get char embeddings
word2id = read_dictionary(Path + '\\word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = 'D:\\resource\\general_hypernym_extraction\\data\\train.txt'
    test_path = 'D:\\resource\\general_hypernym_extraction\\data\\valid.txt'
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
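# --- Illustrative sketch (not part of the original script) ---
# random_embedding(word2id, embedding_dim) is only called above, never defined
# here. Assuming it simply draws a uniform random matrix with one row per
# vocabulary id (the repo's implementation may use a different range), a sketch:
import numpy as np

def random_embedding_sketch(vocab, embedding_dim):
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    return embedding_mat.astype(np.float32)   # same dtype the pretrained branch uses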
parser.add_argument('--optimizer', type=str, default='Adam',
                    help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True,
                    help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.6, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random',
                    help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

# build the vocabulary: write word2id.pkl from the training data
# (the pickle path must match the one read_dictionary loads below)
vocab_build(os.path.join('.', args.train_data, 'word2id.pkl'),
            os.path.join('.', args.train_data, 'train_data'), 1)
# encode characters, i.e. build the char-to-id mapping
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
                tuple_list.append(tagg_else)
                h += 1
        except Exception as e:
            print(temp)
            continue
        # tuple_list.append(temp)
        # print(tuple_list)
        for tagg_item in tuple_list:
            # print(tagg_item)
            w.write('\t'.join(tagg_item) + '\n')
        w.write('\n')
    f.close()
    print("Processing finished, " + str(i) + " lines in total")

# ================================================================================
"""
sens_cut("./corpus/car_ner_corpus.xlsx", "./data_path/original/combine_data.txt", "合并")
sens_cut("./corpus/car_ner_corpus.xlsx", "./data_path/original/train_data.txt", "train")
sens_cut("./corpus/car_ner_corpus.xlsx", "./data_path/original/test_data.txt", "test")
words_label('./data_path/original/combine_data.txt', './data_path/original/combine_tagged.txt')
words_label('./data_path/original/train_data.txt', './data_path/original/train_tagged.txt')
words_label('./data_path/original/test_data.txt', './data_path/original/test_tagged.txt')
"""
data_format('./data_path/original/combine_tagged.txt', './data_path/combine_format')
data_format('./data_path/original/train_tagged.txt', './data_path/train_data')
data_format('./data_path/original/test_tagged.txt', './data_path/test_data')
data.vocab_build('./data_path/word2id.pkl', './data_path/combine_format', 5)
# generate the word2id.pkl vocabulary file
import data
# import re

file = 'highmath_data'
data.vocab_build('./' + file + '/word2id.pkl', './' + file + '/ner_train_data', 1)
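# --- Illustrative sketch (not part of the original script) ---
# vocab_build(vocab_path, corpus_path, min_count) is used throughout these
# snippets. A hypothetical version, assuming it counts character frequencies in
# the "char\ttag" corpus, drops characters below min_count, and pickles a
# char-to-id dict (the project's real implementation and special tokens may differ):
import pickle

def vocab_build_sketch(vocab_path, corpus_path, min_count):
    counts = {}
    with open(corpus_path, encoding='utf-8') as fr:
        for line in fr:
            line = line.rstrip('\n')
            if not line:                      # blank lines separate sentences
                continue
            char = line.split('\t')[0]
            counts[char] = counts.get(char, 0) + 1
    word2id = {'<PAD>': 0}                    # special tokens are an assumption
    for char, cnt in counts.items():
        if cnt >= min_count:
            word2id[char] = len(word2id)
    word2id['<UNK>'] = len(word2id)
    with open(vocab_path, 'wb') as fw:
        pickle.dump(word2id, fw)
    return word2id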
parser.add_argument('--CRF', type=str2bool, default=True,
                    help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random',
                    help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train',
                    help='train/test/demo/all/all_2')
parser.add_argument('--demo_model', type=str, default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

## get char embeddings
if not os.path.exists(os.path.join('.', args.train_data, 'word2id.pkl')):
    vocab_build(os.path.join('.', args.train_data, 'word2id.pkl'),
                os.path.join('.', args.train_data, 'train_data'), 5)
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim,
                                  os.path.join('.', args.train_data, 'all_test'))
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
## tags, BIO
tag2label = {"O": 0,
             "B-PER": 1, "I-PER": 2,
             "B-LOC": 3, "I-LOC": 4,
             "B-ORG": 5, "I-ORG": 6,
             "B-TIM": 7, "I-TIM": 8,
             }

## get char embeddings
if args.pretrain_embedding == 'random':
    # use randomly initialized embeddings
    vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
    corpus_path = os.path.join('.', args.train_data, 'train_data')
    word2id = vocab_build(vocab_path, corpus_path, min_count=1)  # assign an id to every character
    embeddings = random_embedding(word2id, args.embedding_dim)   # random init vectors
else:
    vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
    embedding_path = os.path.join('.', 'embedding', args.pretrain_embedding)
    # assign an id to every character and load the pretrained vectors
    word2id, embeddings = read_pretrain_embedding(vocab_path, embedding_path)

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
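# --- Usage example (not part of the original script) ---
# tag2label maps each BIO tag to the integer label the model trains on; for a
# made-up tag sequence:
example_tags = ["B-PER", "I-PER", "O", "B-LOC", "I-LOC"]
example_labels = [tag2label[t] for t in example_tags]
print(example_labels)   # [1, 2, 0, 3, 4]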
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

# vocabulary build
if not os.path.exists(
        os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
    vocab_build(os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
                os.path.join('data_path', args.dataset_name, 'train_data.txt'))

# get word dictionary
word2id = read_dictionary(
    os.path.join('data_path', args.dataset_name, 'word2id.pkl'))

# build char embeddings
if not args.use_pre_emb:
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    pre_emb_path = os.path.join('.', args.pretrained_emb_path)
    embeddings_path = os.path.join('data_path', args.dataset_name,
                                   'pretrain_embedding.npy')
    if not os.path.exists(embeddings_path):
        build_character_embeddings(pre_emb_path, embeddings_path, word2id,